Notes: The Linux 2.4.0 Filesystem

Reading Data from a Block Device

The call chain when ext2 reads a block of data from disk into memory:

ext2_readpage(page)
    -> block_read_full_page(page, get_block) # get_block = ext2_get_block, fs/buffer.c
        -> blocksize = inode->i_sb->s_blocksize;
        -> create_empty_buffers(page, inode->i_dev, blocksize) # only called when page->buffers is NULL; the resulting buffer_heads are stored in page->buffers
        -> head = page->buffers;
        -> ext2_get_block(inode, iblock, bh, 0) # iblock is computed from the page index
            -> ext2_block_to_path(inode, iblock, offsets)  # fills offsets[] with the per-level indexes; chain[depth-1].key will end up holding the final physical block number
            -> ext2_get_branch(inode, depth, offsets, chain, &err) # reads the blocks along offsets[], level by level, until the final physical block number is reached
                -> bread(dev, block, size) # fs/buffer.c
                    # fs/buffer.c: finds the buffer_head in the hash table, or allocates one from free_list[isize];
                    # the bh here is presumably one created by create_empty_buffers, or an already existing one
                    -> bh = getblk(dev, block, size)
                        -> __get_hash_table(dev, block, size) # walks a buffer_head hash chain
                        -> free_list[isize]
                    -> ll_rw_block(READ, 1, &bh)
                    -> wait_on_buffer(bh)

Buffer Pages

include/linux/fs.h:365
struct address_space {
	struct list_head	clean_pages;	/* list of clean pages */
	struct list_head	dirty_pages;	/* list of dirty pages */
	struct list_head	locked_pages;	/* list of locked pages */
	unsigned long		nrpages;	/* number of total pages */
	struct address_space_operations *a_ops;	/* methods */
	struct inode		*host;		/* owner: inode, block_device */
	struct vm_area_struct	*i_mmap;	/* list of private mappings */
	struct vm_area_struct	*i_mmap_shared; /* list of shared mappings */
	spinlock_t		i_shared_lock;  /* and spinlock protecting it */
};
include/linux/fs.h:387
struct inode {
	struct list_head	i_hash;
	struct list_head	i_list;
	struct list_head	i_dentry;

	struct list_head	i_dirty_buffers;

	unsigned long		i_ino;
	atomic_t		i_count;
	kdev_t			i_dev;
	umode_t			i_mode;
	nlink_t			i_nlink;
	uid_t			i_uid;
	gid_t			i_gid;
	kdev_t			i_rdev;
	loff_t			i_size;
	time_t			i_atime;
	time_t			i_mtime;
	time_t			i_ctime;
	unsigned long		i_blksize;
	unsigned long		i_blocks;
	unsigned long		i_version;
	struct semaphore	i_sem;
	struct semaphore	i_zombie;
	struct inode_operations	*i_op;
	struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
	struct super_block	*i_sb;
	wait_queue_head_t	i_wait;
	struct file_lock	*i_flock;
    // i_mapping usually points to i_data; the pages caching this inode's contents hang off it
    // that is, file contents are cached in units of memory pages, not in units of disk blocks
    // with a 1K block size, one (4K) page corresponds to 4 blocks
    // this is done to unify file-content caching with file memory mapping
	struct address_space	*i_mapping;
	struct address_space	i_data;
	struct dquot		*i_dquot[MAXQUOTAS];
	struct pipe_inode_info	*i_pipe;
	struct block_device	*i_bdev;

	unsigned long		i_dnotify_mask; /* Directory notify events */
	struct dnotify_struct	*i_dnotify; /* for directory notifications */

	unsigned long		i_state;

	unsigned int		i_flags;
	unsigned char		i_sock;

	atomic_t		i_writecount;
	unsigned int		i_attr_flags;
	__u32			i_generation;
	union {
		struct minix_inode_info		minix_i;
		struct ext2_inode_info		ext2_i;
		struct hpfs_inode_info		hpfs_i;
		struct ntfs_inode_info		ntfs_i;
		struct msdos_inode_info		msdos_i;
		struct umsdos_inode_info	umsdos_i;
		struct iso_inode_info		isofs_i;
		struct nfs_inode_info		nfs_i;
		struct sysv_inode_info		sysv_i;
		struct affs_inode_info		affs_i;
		struct ufs_inode_info		ufs_i;
		struct efs_inode_info		efs_i;
		struct romfs_inode_info		romfs_i;
		struct shmem_inode_info		shmem_i;
		struct coda_inode_info		coda_i;
		struct smb_inode_info		smbfs_i;
		struct hfs_inode_info		hfs_i;
		struct adfs_inode_info		adfs_i;
		struct qnx4_inode_info		qnx4_i;
		struct bfs_inode_info		bfs_i;
		struct udf_inode_info		udf_i;
		struct ncp_inode_info		ncpfs_i;
		struct proc_inode_info		proc_i;
		struct socket			socket_i;
		struct usbdev_inode_info        usbdev_i;
		void				*generic_ip;
	} u;
};
include/linux/fs.h:209
/*
 * Try to keep the most commonly used fields in single cache lines (16
 * bytes) to improve performance.  This ordering should be
 * particularly beneficial on 32-bit processors.
 *
 * We use the first 16 bytes for the data which is used in searches
 * over the block hash lists (ie. getblk() and friends).
 *
 * The second 16 bytes we use for lru buffer scans, as used by
 * sync_buffers() and refill_freelist().  -- sct
 */
struct buffer_head {
	/* First cache line: */
	struct buffer_head *b_next;	/* Hash queue list */
	unsigned long b_blocknr;	/* block number */
	unsigned short b_size;		/* block size */
	unsigned short b_list;		/* List that this buffer appears */
	kdev_t b_dev;			/* device (B_FREE = free) */

	atomic_t b_count;		/* users using this block */
	kdev_t b_rdev;			/* Real device */
	unsigned long b_state;		/* buffer state bitmap (see above) */
	unsigned long b_flushtime;	/* Time when (dirty) buffer should be written */

	struct buffer_head *b_next_free;/* lru/free list linkage */
	struct buffer_head *b_prev_free;/* doubly linked list of buffers */
	struct buffer_head *b_this_page;/* circular list of buffers in one page */
	struct buffer_head *b_reqnext;	/* request queue */

	struct buffer_head **b_pprev;	/* doubly linked list of hash-queue */
	char * b_data;			/* pointer to data block (512 byte) */
	struct page *b_page;		/* the page this bh is mapped to */
	void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
 	void *b_private;		/* reserved for b_end_io */

	unsigned long b_rsector;	/* Real buffer location on disk */
	wait_queue_head_t b_wait;

	struct inode *	     b_inode;
	struct list_head     b_inode_buffers;	/* doubly linked list of inode dirty buffers */
};

Take a buffered page as an example:

  1. At the file layer, its page structure is linked into the buffer-page list of the inode structure it belongs to.
  2. It can also be mapped into the address space of processes through their page tables.
  3. At the device layer, it is linked, via several buffer_head structures, into the buffer queues of the device it lives on.

Buffering file contents in units of pages thus serves three purposes at once.

Besides being linked into the buffer-page list of inode->i_mapping (an address_space), a buffered page's page structure is also linked into a hash queue of the hash table page_hash_table.

mm/filemap.c:45
unsigned int page_hash_bits;
struct page **page_hash_table;
include/linux/pagemap.h:50
/*
 * We use a power-of-two hash table to avoid a modulus,
 * and get a reasonable hash by knowing roughly how the
 * inode pointer and indexes are distributed (ie, we
 * roughly know which bits are "significant")
 *
 * For the time being it will work for struct address_space too (most of
 * them sitting inside the inodes). We might want to change it later.
 */
extern inline unsigned long _page_hashfn(struct address_space * mapping, unsigned long index)
{
#define i (((unsigned long) mapping)/(sizeof(struct inode) & ~ (sizeof(struct inode) - 1)))
#define s(x) ((x)+((x)>>PAGE_HASH_BITS))
	return s(i+index) & (PAGE_HASH_SIZE-1);
#undef i
#undef s
}

#define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index))
mm/filemap.c:525
static int add_to_page_cache_unique(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	int err;
	struct page *alias;

	spin_lock(&pagecache_lock);
	alias = __find_page_nolock(mapping, offset, *hash);

	err = 1;
	if (!alias) {
		__add_to_page_cache(page,mapping,offset,hash);
		err = 0;
	}

	spin_unlock(&pagecache_lock);
	return err;
}
mm/filemap.c:496
/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, but unreferenced, not uptodate and with no errors.
 */
static inline void __add_to_page_cache(struct page * page,
	struct address_space *mapping, unsigned long offset,
	struct page **hash)
{
	unsigned long flags;

	if (PageLocked(page))
		BUG();

	flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_dirty) | (1 << PG_referenced) | (1 << PG_arch_1));
	page->flags = flags | (1 << PG_locked);
	page_cache_get(page);
	page->index = offset;
	add_page_to_inode_queue(mapping, page);
	add_page_to_hash_queue(page, hash);
	lru_cache_add(page);
}
fs/buffer.c:77
static unsigned int bh_hash_mask;
static unsigned int bh_hash_shift;
static struct buffer_head **hash_table;
static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;

Looking up a target page is therefore very efficient; there is no need to search the entire buffer-page list linearly.
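
As a minimal sketch of that lookup (modeled on __find_page_nolock in 2.4.0's mm/filemap.c, with the LRU-aging details omitted; the helper name find_page_sketch is mine):

// Minimal sketch, not the verbatim kernel code: walk the hash chain
// selected by page_hash(mapping, index), looking for the page that
// belongs to this mapping at this index. Caller holds pagecache_lock.
static struct page *find_page_sketch(struct address_space *mapping,
				     unsigned long index)
{
	struct page *page;

	for (page = *page_hash(mapping, index); page; page = page->next_hash) {
		if (page->mapping == mapping && page->index == index)
			break;	// hit: same owner, same offset within it
	}
	return page;	// NULL means the page is not cached
}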

In addition, a buffered page's page structure is also linked into an LRU list, so that when memory is short and a page has not been accessed for a long time, it can be released.

Caching in units of pages rather than disk blocks inherently implies read-ahead, because one page usually holds 4 blocks (with a 1K block size). Unless the access falls within the page's last block, a few extra blocks are always read in ahead of time.
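
A back-of-the-envelope illustration of that implicit read-ahead, assuming 4K pages and 1K blocks (the constants and the standalone program are mine, not kernel code):

#include <stdio.h>

#define PAGE_SHIFT  12	/* 4K page  */
#define BLOCK_SHIFT 10	/* 1K block */

int main(void)
{
	unsigned long pos   = 5000;                                 /* read one byte here */
	unsigned long index = pos >> PAGE_SHIFT;                    /* -> page 1          */
	unsigned long first = index << (PAGE_SHIFT - BLOCK_SHIFT);  /* -> block 4         */
	unsigned long nblk  = 1UL << (PAGE_SHIFT - BLOCK_SHIFT);    /* 4 blocks per page  */

	/* a one-byte read at offset 5000 submits blocks 4..7 of the file:
	 * blocks 5..7 come in "for free" -- that is the implicit read-ahead */
	printf("read at %lu -> page %lu -> blocks %lu..%lu\n",
	       pos, index, first, first + nblk - 1);
	return 0;
}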

The ext2 read path

sys_read calls the read function pointer in file_operations; for the ext2 filesystem, this function is generic_file_read.
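
A simplified sketch of that dispatch, in the spirit of sys_read in 2.4.0's fs/read_write.c (permission checks, locking and dnotify trimmed):

// Simplified sketch, not the verbatim 2.4.0 code: sys_read resolves the
// fd to a struct file and calls through file->f_op->read, which for an
// ext2 regular file is generic_file_read.
asmlinkage ssize_t sys_read(unsigned int fd, char *buf, size_t count)
{
	ssize_t ret = -EBADF;
	struct file *file = fget(fd);

	if (file) {
		ret = -EINVAL;
		if ((file->f_mode & FMODE_READ) && file->f_op && file->f_op->read)
			ret = file->f_op->read(file, buf, count, &file->f_pos);
		fput(file);
	}
	return ret;
}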

// fs/ext2/file.c:96
/*
 * We have mostly NULL's here: the current defaults are ok for
 * the ext2 filesystem.
 */
struct file_operations ext2_file_operations = {
	llseek:		ext2_file_lseek,
	read:		generic_file_read, // defined here
	write:		generic_file_write,
	ioctl:		ext2_ioctl,
	mmap:		generic_file_mmap,
	open:		ext2_open_file,
	release:	ext2_release_file,
	fsync:		ext2_sync_file,
};

// mm/filemap.c:1237
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	ssize_t retval;

	retval = -EFAULT;
	if (access_ok(VERIFY_WRITE, buf, count)) {
		retval = 0;

		if (count) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.count = count;
			desc.buf = buf;
			desc.error = 0;
			do_generic_file_read(filp, ppos, &desc, file_read_actor);

			retval = desc.written;
			if (!retval)
				retval = desc.error;
		}
	}
	return retval;
}

This function is clearly just a wrapper around do_generic_file_read. The file_read_actor argument passed to it is a function whose job is to copy file contents from a buffered page into the user-space buffer.

// mm/filemap.c:1215
static int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
{
	char *kaddr;
	unsigned long left, count = desc->count;

	if (size > count)
		size = count;
	// `kmap` is simple here: it just returns the virtual address of `page`
	// include/linux/highmem.h:33
	// static inline void *kmap(struct page *page) { return page_address(page); }
	// include/asm-i386/pgtable.h:260
	// #define page_address(page) ((page)->virtual)
	kaddr = kmap(page);
	left = __copy_to_user(desc->buf, kaddr + offset, size);
	kunmap(page);

	if (left) {
		size -= left;
		desc->error = -EFAULT;
	}
	desc->count = count - size;
	desc->written += size;
	desc->buf += size;
	return size;
}

Next we analyze do_generic_file_read. generic_file_readahead runs asynchronously and can be skipped for now.

// mm/filemap.c:1005
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index, offset;
	struct page *cached_page;
	int reada_ok;
	int error;
	int max_readahead = get_max_readahead(inode);

	cached_page = NULL;
	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
		reada_ok = 0;
		filp->f_raend = 0;
		filp->f_ralen = 0;
		filp->f_ramax = 0;
		filp->f_rawin = 0;
	} else {
		reada_ok = 1;
	}
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stay in the first half page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
		filp->f_ramax = 0;
	} else {
		unsigned long needed;

		needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
				filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
	}

	for (;;) {
		struct page *page, **hash;
		unsigned long end_index, nr; // nr is the number of bytes to read

		// page index of the last page of the file
		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
		if (index > end_index) // reading past the file's last page: stop
			break;
		nr = PAGE_CACHE_SIZE; // by default read a whole page
		if (index == end_index) { // but on the last page, read only the leading valid part
			nr = inode->i_size & ~PAGE_CACHE_MASK;
			if (nr <= offset)  // the requested offset lies beyond the valid data: stop
				break;
		}

		// there is usually an offset into the first page; data before it is not read
		nr = nr - offset;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(mapping, index);

		spin_lock(&pagecache_lock);
		page = __find_page_nolock(mapping, index, *hash);
		if (!page)
			goto no_cached_page; // no cached page was found
found_page:
		page_cache_get(page);
		spin_unlock(&pagecache_lock);

		if (!Page_Uptodate(page))
			goto page_not_up_to_date; // the page was found, but it is not up to date
		generic_file_readahead(reada_ok, filp, inode, page);
page_ok:
		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping->i_mmap_shared != NULL)
			flush_dcache_page(page); // a no-op on i386: include/asm-i386/pgtable.h:33: #define flush_dcache_page(page) do { } while (0)

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, page, offset, nr);
		offset += nr;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		page_cache_release(page);
		if (nr && desc->count)
			continue;
		break;

/*
 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
 */
page_not_up_to_date:
		// The page is not up to date, so data cannot be copied out of it right away.
		// Being out of date is a transient condition: some pages, this one included,
		// are being written but have not yet been committed;
		// usually it is enough to wait a little.
		// Since we have to wait anyway, we might as well read some pages ahead, so
		// generic_file_readahead is called to start read-ahead (start it, not wait
		// for it to finish -- it is asynchronous)
		generic_file_readahead(reada_ok, filp, inode, page);
		// having started the read-ahead, check again whether the page is up to date
		if (Page_Uptodate(page))
			goto page_ok; // if it is, handle it at page_ok

		// If it still is not, the page has to be read in from the device.
		// Before reading it we must lock it; lock_page here may imply waiting,
		// since another process may be holding the lock --
		// in particular, as the page is not up to date, some process is writing
		// to it, and that process may well be the one holding the lock
		/* Get exclusive access to the page ... */
		lock_page(page);

		/* Did it get unhashed before we got the lock? */
		if (!page->mapping) {
			UnlockPage(page);
			page_cache_release(page);
			continue;
		}

		// Returning from lock_page means the current process now holds the page lock.
		// Precisely because of that, the page may well have become up to date by the
		// time the lock was acquired (another process finished handling it), so check
		// once more; if it is now up to date, release the lock and go to page_ok
		/* Did somebody else fill it already? */
		if (Page_Uptodate(page)) {
			UnlockPage(page);
			goto page_ok;
		}

		// still not up to date: read the page back in from the device
readpage:
		/* ... and start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

		if (!error) {
			if (Page_Uptodate(page))
				goto page_ok;

			/* Again, try some read-ahead while waiting for the page to finish.. */
			// readpage above actually runs asynchronously, which is why the page
			// may still not be up to date at this point; since we would only be
			// waiting anyway, start some more read-ahead first
			generic_file_readahead(reada_ok, filp, inode, page);

			// wait for the data
			wait_on_page(page);
			if (Page_Uptodate(page))
				goto page_ok;

			// if the data is still not ready, an error has occurred
			error = -EIO;
		}

		/* UHHUH! A synchronous read error occurred. Report it */
		desc->error = error;
		// include/linux/pagemap.h:34:#define page_cache_release(x)      __free_page(x)
		// include/linux/mm.h:379:#define __free_page(page) __free_pages((page), 0)
		page_cache_release(page);
		break;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 *
		 * We get here with the page cache lock held.
		 */
		if (!cached_page) {
			spin_unlock(&pagecache_lock);
			// not found, so allocate a fresh buffer page
			cached_page = page_cache_alloc();
			if (!cached_page) {
				desc->error = -ENOMEM;
				break;
			}

			/*
			 * Somebody may have added the page while we
			 * dropped the page cache lock. Check for that.
			 */
			 // while the lock was dropped, another process may have added this
			 // page, so check once more;
			 // if it is found now, go execute the found_page logic
			 // (cached_page is not freed yet here, so it can be reused later)
			spin_lock(&pagecache_lock);
			page = __find_page_nolock(mapping, index, *hash);
			if (page)
				goto found_page;
		}

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		// if it is still absent, add the new page to the hash queues and jump
		// to readpage to actually read in the data
		page = cached_page;
		__add_to_page_cache(page, mapping, index, hash);
		spin_unlock(&pagecache_lock);
		cached_page = NULL;

		goto readpage;
	}

	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	filp->f_reada = 1;
	if (cached_page)
		page_cache_free(cached_page); // include/linux/pagemap.h:33:#define page_cache_free(x) __free_page(x)
	UPDATE_ATIME(inode);
}

ext2 readpage

// fs/ext2/inode.c:657
static int ext2_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page,ext2_get_block);
}
// fs/buffer.c:1667
/*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
 * mark_buffer_uptodate() functions propagate buffer state into the
 * page struct once IO has completed.
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
	struct inode *inode = page->mapping->host;
	unsigned long iblock, lblock;
	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
	unsigned int blocksize, blocks;
	int nr, i;

	if (!PageLocked(page))
		PAGE_BUG(page);
	blocksize = inode->i_sb->s_blocksize;

	// allocate buffer_heads for the page, establishing their mapping onto it
	if (!page->buffers)
		create_empty_buffers(page, inode->i_dev, blocksize);
	head = page->buffers;

	blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits;

	// iblock is the first logical block number covered by this page
	iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
	lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits;
	bh = head;
	nr = 0;
	i = 0;

	do {
		// a page being out of date does not mean that every block in it is stale;
		// blocks that are already up to date can simply be skipped
		if (buffer_uptodate(bh))
			continue;

		// if this block has not yet been mapped to a block on the device
		if (!buffer_mapped(bh)) {
			// and the block does not start past the end of the file,
			// establish the mapping through the get_block function passed in;
			// for the ext2 filesystem, get_block is ext2_get_block
			if (iblock < lblock) {
				if (get_block(inode, iblock, bh, 0)) // the 4th argument is 0: do not allocate a physical block
					continue;
			}
			// if the buffer is still unmapped after calling ext2_get_block, the block
			// has no physical block behind it yet; this happens when lseek has left
			// a hole in the file, so the block is simply zero-filled.
			// When does it get a physical block on the device, then?
			// Only when the block is written: the write path calls ext2_get_block
			// with the 4th argument set to 1, and that allocates the physical block
			if (!buffer_mapped(bh)) {
				memset(kmap(page) + i*blocksize, 0, blocksize);
				flush_dcache_page(page);
				kunmap(page);
				set_bit(BH_Uptodate, &bh->b_state);
				continue;
			}
			/* get_block() might have updated the buffer synchronously */
			if (buffer_uptodate(bh))
				continue;
		}

		arr[nr] = bh;
		nr++;
	} while (i++, iblock++, (bh = bh->b_this_page) != head);

	if (!nr) {
		/*
		 * all buffers are uptodate - we can set the page
		 * uptodate as well.
		 */
		SetPageUptodate(page);
		UnlockPage(page);
		return 0;
	}

	/* Stage two: lock the buffers */
	for (i = 0; i < nr; i++) {
		struct buffer_head * bh = arr[i];
		lock_buffer(bh);
		bh->b_end_io = end_buffer_io_async;
		atomic_inc(&bh->b_count);
	}

	/* Stage 3: start the IO */
	for (i = 0; i < nr; i++)
		submit_bh(READ, arr[i]);

	return 0;
}
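
The b_end_io hook installed above is end_buffer_io_async. A simplified sketch of what it does on IO completion (the real fs/buffer.c version also takes a per-page spinlock, drops the b_count reference and handles the error path):

// Simplified sketch, not the verbatim 2.4.0 code: propagate buffer state
// up into the page, as the comment above block_read_full_page describes.
static void end_buffer_io_async_sketch(struct buffer_head *bh, int uptodate)
{
	struct page *page = bh->b_page;
	struct buffer_head *tmp;

	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);

	// if any sibling buffer of this page is still locked or stale, the
	// page as a whole is not done yet
	for (tmp = bh->b_this_page; tmp != bh; tmp = tmp->b_this_page)
		if (buffer_locked(tmp) || !buffer_uptodate(tmp))
			return;

	// the last buffer has finished: the whole page is now up to date
	SetPageUptodate(page);
	UnlockPage(page);
}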
// fs/buffer.c:1426
static void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
{
	struct buffer_head *bh, *head, *tail;

	head = create_buffers(page, blocksize, 1);
	if (page->buffers)
		BUG();

	bh = head;
	do {
		bh->b_dev = dev;
		bh->b_blocknr = 0;
		bh->b_end_io = NULL;
		tail = bh;
		bh = bh->b_this_page;
	} while (bh);
	tail->b_this_page = head;
	page->buffers = head;
	page_cache_get(page);
}
// fs/buffer.c:1282
/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created.  Return NULL if unable to create more
 * buffers.
 * The async flag is used to differentiate async IO (paging, swapping)
 * from ordinary buffer allocations, and only async requests are allowed
 * to sleep waiting for buffer heads.
 */
static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
{
	struct buffer_head *bh, *head;
	long offset;

try_again:
	head = NULL;
	offset = PAGE_SIZE;
	while ((offset -= size) >= 0) {
		bh = get_unused_buffer_head(async);
		if (!bh)
			goto no_grow;

		bh->b_dev = B_FREE;  /* Flag as unused */
		bh->b_this_page = head;
		head = bh;

		bh->b_state = 0;
		bh->b_next_free = NULL;
		bh->b_pprev = NULL;
		atomic_set(&bh->b_count, 0);
		bh->b_size = size;

		set_bh_page(bh, page, offset);

		bh->b_list = BUF_CLEAN;
		bh->b_end_io = NULL;
	}
	return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
	if (head) {
		spin_lock(&unused_list_lock);
		do {
			bh = head;
			head = head->b_this_page;
			__put_unused_buffer_head(bh);
		} while (head);
		spin_unlock(&unused_list_lock);

		/* Wake up any waiters ... */
		wake_up(&buffer_wait);
	}

	/*
	 * Return failure for non-async IO requests.  Async IO requests
	 * are not allowed to fail, so we have to wait until buffer heads
	 * become available.  But we don't want tasks sleeping with
	 * partially complete buffers, so all were released above.
	 */
	if (!async)
		return NULL;

	/* We're _really_ low on memory. Now we just
	 * wait for old buffer heads to become free due to
	 * finishing IO.  Since this is an async request and
	 * the reserve list is empty, we're sure there are
	 * async buffer heads in use.
	 */
	run_task_queue(&tq_disk);

	/*
	 * Set our state for sleeping, then check again for buffer heads.
	 * This ensures we won't miss a wake_up from an interrupt.
	 */
	wait_event(buffer_wait, nr_unused_buffer_heads >= MAX_BUF_PER_PAGE);
	goto try_again;
}
// fs/ext2/inode.c:493
/*
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 */

static int ext2_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create)
{
	int err = -EIO;
	int offsets[4];
	Indirect chain[4];
	Indirect *partial;
	unsigned long goal;
	int left;
	int depth = ext2_block_to_path(inode, iblock, offsets);

	if (depth == 0)
		goto out;

	lock_kernel();
reread:
	partial = ext2_get_branch(inode, depth, offsets, chain, &err);

	/* Simplest case - block found, no allocation needed */
	if (!partial) {
got_it:
		bh_result->b_dev = inode->i_dev;
		bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
		bh_result->b_state |= (1UL << BH_Mapped);
		/* Clean up and exit */
		partial = chain+depth-1; /* the whole chain */
		goto cleanup;
	}

	/* Next simple case - plain lookup or failed read of indirect block */
	if (!create || err == -EIO) {
cleanup:
		while (partial > chain) { // chain[0] is not released (its bh is NULL)
			brelse(partial->bh);
			partial--;
		}
		unlock_kernel();
out:
		return err;
	}

	/*
	 * Indirect block might be removed by truncate while we were
	 * reading it. Handling of that case (forget what we've got and
	 * reread) is taken out of the main path.
	 */
	if (err == -EAGAIN)
		goto changed;

	if (ext2_find_goal(inode, iblock, chain, partial, &goal) < 0)
		goto changed;

	left = (chain + depth) - partial;
	err = ext2_alloc_branch(inode, left, goal,
					offsets+(partial-chain), partial);
	if (err)
		goto cleanup;

	if (ext2_splice_branch(inode, iblock, chain, partial, left) < 0)
		goto changed;

	bh_result->b_state |= (1UL << BH_New);
	goto got_it;

changed:
	while (partial > chain) {
		bforget(partial->bh);
		partial--;
	}
	goto reread;
}

The logic of ext2_block_to_path is to prepare the path of per-level offsets that will be used to translate a logical block number into the physical block number. Understanding it requires knowing the i_block field array in the ext2 filesystem's inode. (A worked example follows the function below.)

// fs/ext2/inode.c:144
/**
 *	ext2_block_to_path - parse the block number into array of offsets
 *	@inode: inode in question (we are only interested in its superblock)
 *	@i_block: block number to be parsed
 *	@offsets: array to store the offsets in
 *
 *	To store the locations of file's data ext2 uses a data structure common
 *	for UNIX filesystems - tree of pointers anchored in the inode, with
 *	data blocks at leaves and indirect blocks in intermediate nodes.
 *	This function translates the block number into path in that tree -
 *	return value is the path length and @offsets[n] is the offset of
 *	pointer to (n+1)th node in the nth one. If @block is out of range
 *	(negative or too large) warning is printed and zero returned.
 *
 *	Note: function doesn't find node addresses, so no IO is needed. All
 *	we need to know is the capacity of indirect blocks (taken from the
 *	inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext2_block_to_path(struct inode *inode, long i_block, int offsets[4])
{
	int ptrs = EXT2_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT2_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT2_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;

	if (i_block < 0) {
		ext2_warning (inode->i_sb, "ext2_block_to_path", "block < 0");
	} else if (i_block < direct_blocks) {
		offsets[n++] = i_block;
	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT2_IND_BLOCK;
		offsets[n++] = i_block;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT2_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT2_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
	} else {
		ext2_warning (inode->i_sb, "ext2_block_to_path", "block > big");
	}
	return n;
}
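A worked example of the path computation, assuming a 1K block size so that EXT2_ADDR_PER_BLOCK is 256 (ptrs = 256, ptrs_bits = 8; EXT2_NDIR_BLOCKS = 12, EXT2_IND_BLOCK = 12, EXT2_DIND_BLOCK = 13):

/* Worked example (1K blocks, ptrs = 256):
 *
 *   i_block = 5    direct: 5 < 12
 *                  depth 1, offsets = { 5 }
 *
 *   i_block = 12   single indirect: 12 - 12 = 0 < 256
 *                  depth 2, offsets = { EXT2_IND_BLOCK, 0 } = { 12, 0 }
 *
 *   i_block = 300  double indirect: 300 - 12 - 256 = 32 < 256*256
 *                  depth 3, offsets = { EXT2_DIND_BLOCK, 32 >> 8, 32 & 255 }
 *                                   = { 13, 0, 32 }
 *
 * ext2_get_branch then follows these offsets: offsets[0] indexes
 * inode->u.ext2_i.i_data[], and each subsequent offsets[n] indexes the
 * array of 32-bit block numbers inside the indirect block read by bread().
 */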
// fs/ext2/inode.c:125
typedef struct {
	u32	*p;
	u32	key;
	struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
{
	p->key = *(p->p = v);
	p->bh = bh;
}

static inline int verify_chain(Indirect *from, Indirect *to)
{
	while (from <= to && from->key == *from->p)
		from++;
	return (from > to);
}


// fs/ext2/inode.c:204
/**
 *	ext2_get_branch - read the chain of indirect blocks leading to data
 *	@inode: inode in question
 *	@depth: depth of the chain (1 - direct pointer, etc.)
 *	@offsets: offsets of pointers in inode/indirect blocks
 *	@chain: place to store the result
 *	@err: here we store the error value
 *
 *	Function fills the array of triples  and returns %NULL
 *	if everything went OK or the pointer to the last filled triple
 *	(incomplete one) otherwise. Upon the return chain[i].key contains
 *	the number of (i+1)-th block in the chain (as it is stored in memory,
 *	i.e. little-endian 32-bit), chain[i].p contains the address of that
 *	number (it points into struct inode for i==0 and into the bh->b_data
 *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 *	block for i>0 and NULL for i==0. In other words, it holds the block
 *	numbers of the chain, addresses they were taken from (and where we can
 *	verify that chain did not change) and buffer_heads hosting these
 *	numbers.
 *
 *	Function stops when it stumbles upon zero pointer (absent block)
 *		(pointer to last triple returned, *@err == 0)
 *	or when it gets an IO error reading an indirect block
 *		(ditto, *@err == -EIO)
 *	or when it notices that chain had been changed while it was reading
 *		(ditto, *@err == -EAGAIN)
 *	or when it reads all @depth-1 indirect blocks successfully and finds
 *	the whole chain, all way to the data (returns %NULL, *err == 0).
 */
static inline Indirect *ext2_get_branch(struct inode *inode,
					int depth,
					int *offsets,
					Indirect chain[4],
					int *err)
{
	kdev_t dev = inode->i_dev;
	int size = inode->i_sb->s_blocksize;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	// ext2_i is the ext2_inode_info structure;
	// ext2_i.i_data is a 15-element array, matching ext2_inode.i_block on disk
	add_chain (chain, NULL, inode->u.ext2_i.i_data + *offsets);
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = bread(dev, le32_to_cpu(p->key), size);
		if (!bh)
			goto failure;
		/* Reader: pointers */
		if (!verify_chain(chain, p))
			goto changed;
		add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;
	}
	return NULL;

changed:
	*err = -EAGAIN;
	goto no_block;
failure:
	*err = -EIO;
no_block:
	return p;
}
// include/linux/ext2_fs_i.h
/*
 * second extended file system inode data in memory
 */
struct ext2_inode_info {
    __u32   i_data[15];
    __u32   i_flags;
    __u32   i_faddr;
    __u8    i_frag_no;
    __u8    i_frag_size;
    __u16   i_osync;
    __u32   i_file_acl;
    __u32   i_dir_acl;
    __u32   i_dtime;
    __u32   not_used_1; /* FIX: not used/ 2.2 placeholder */
    __u32   i_block_group;
    __u32   i_next_alloc_block;
    __u32   i_next_alloc_goal;
    __u32   i_prealloc_block;
    __u32   i_prealloc_count;
    __u32   i_high_size;
    int i_new_inode:1;  /* Is a freshly allocated inode */
};
// fs/buffer.c:1169
/*
 * bread() reads a specified block and returns the buffer that contains
 * it. It returns NULL if the block was unreadable.
 */
struct buffer_head * bread(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;

	bh = getblk(dev, block, size);
	if (buffer_uptodate(bh))
		return bh;
	ll_rw_block(READ, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);
	return NULL;
}
// fs/buffer.c:968
/*
 * Ok, this is getblk, and it isn't very clear, again to hinder
 * race-conditions. Most of the code is seldom used, (ie repeating),
 * so it should be much more efficient than it looks.
 *
 * The algorithm is changed: hopefully better, and an elusive bug removed.
 *
 * 14.02.92: changed it to sync dirty buffers a bit: better performance
 * when the filesystem starts to get full of dirty blocks (I hope).
 */
struct buffer_head * getblk(kdev_t dev, int block, int size)
{
	struct buffer_head * bh;
	int isize;

repeat:
	spin_lock(&lru_list_lock);
	write_lock(&hash_table_lock);
	bh = __get_hash_table(dev, block, size);
	if (bh)
		goto out;

	isize = BUFSIZE_INDEX(size);
	spin_lock(&free_list[isize].lock);
	bh = free_list[isize].list;
	if (bh) {
		__remove_from_free_list(bh, isize);
		atomic_set(&bh->b_count, 1);
	}
	spin_unlock(&free_list[isize].lock);

	/*
	 * OK, FINALLY we know that this buffer is the only one of
	 * its kind, we hold a reference (b_count>0), it is unlocked,
	 * and it is clean.
	 */
	if (bh) {
		init_buffer(bh, NULL, NULL);
		bh->b_dev = dev;
		bh->b_blocknr = block;
		bh->b_state = 1 << BH_Mapped;

		/* Insert the buffer into the regular lists */
		__insert_into_queues(bh);
	out:
		write_unlock(&hash_table_lock);
		spin_unlock(&lru_list_lock);
		touch_buffer(bh);
		return bh;
	}

	/*
	 * If we block while refilling the free list, somebody may
	 * create the buffer first ... search the hashes again.
	 */
	write_unlock(&hash_table_lock);
	spin_unlock(&lru_list_lock);
	refill_freelist(size);
	goto repeat;
}
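
getblk keys its free lists by BUFSIZE_INDEX(size). In 2.4.0 this macro is a table lookup indexed by size >> 9; conceptually it computes log2(size / 512), as in this sketch (the helper name is mine):

// Conceptual sketch of BUFSIZE_INDEX: map a buffer size to a free_list
// slot: 512 -> 0, 1024 -> 1, 2048 -> 2, 4096 -> 3, and so on.
static int bufsize_index_sketch(int size)
{
	int idx = 0;

	for (size >>= 9; size > 1; size >>= 1)
		idx++;
	return idx;
}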

About Mount Points

This is the definition of vfsmount in Linux 2.4.0.

There are two situations involving mnt_clash:

  1. Mount /dev/sdb1 and then /dev/sdb2 onto /mnt in two successive mounts: the dentry of /mnt then carries the mount information of both, i.e. the vfsmounts of /dev/sdb1 and /dev/sdb2 are linked there through their mnt_clash fields.
  2. Suppose there are two empty directories /mnt/a and /mnt/b, and /dev/sdb1 contains an empty directory /x. Mount /dev/sdb1 onto both /mnt/a and /mnt/b (this is allowed); then both /mnt/a/x and /mnt/b/x reach /dev/sdb1. Now take two more devices, /dev/sdc1 and /dev/sdd1, and mount /dev/sdc1 onto /mnt/a/x and /dev/sdd1 onto /mnt/b/x. The dentry of x now carries the vfsmounts of both devices via mnt_clash. That raises a question: which of the vfsmounts hanging off this dentry should the kernel choose when resolving a path? The answer: it decides by their mnt_parent. Walking from / through /mnt and then a or b, the walk crosses a or b through two different vfsmounts; whichever vfsmount of x has its mnt_parent equal to the one we came through identifies the path taken, and hence the right device (see the sketch after the struct below).

include/linux/mount.h:17

struct vfsmount
{
	struct dentry *mnt_mountpoint;	/* dentry of mountpoint */
	struct dentry *mnt_root;	/* root of the mounted tree */
	struct vfsmount *mnt_parent;	/* fs we are mounted on */
	struct list_head mnt_instances;	/* other vfsmounts of the same fs */
	struct list_head mnt_clash;	/* those who are mounted on (other */
					/* instances) of the same dentry */
	struct super_block *mnt_sb;	/* pointer to superblock */
	struct list_head mnt_mounts;	/* list of children, anchored here */
	struct list_head mnt_child;	/* and going through their mnt_child */
	atomic_t mnt_count;
	int mnt_flags;
	char *mnt_devname;		/* Name of device e.g. /dev/dsk/hda1 */
	struct list_head mnt_list;
	uid_t mnt_owner;
};
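
A sketch of that disambiguation, modeled on the path-walk logic in 2.4.0's fs/namei.c (__follow_down); locking and reference counting are omitted and the helper name is mine:

// Sketch: among all vfsmounts mounted on this dentry (linked through
// their mnt_clash fields into dentry->d_vfsmnt), pick the one whose
// mnt_parent is the vfsmount the path walk arrived through.
static struct vfsmount *pick_mount_sketch(struct vfsmount *current_mnt,
					  struct dentry *dentry)
{
	struct list_head *p;

	list_for_each(p, &dentry->d_vfsmnt) {
		struct vfsmount *tmp = list_entry(p, struct vfsmount, mnt_clash);
		if (tmp->mnt_parent == current_mnt)
			return tmp;	// mounted here, and reached via current_mnt
	}
	return NULL;	// not a mountpoint on this particular path
}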

In the 2.4.0 kernel a dentry has a d_vfsmnt field linking the vfsmounts mounted at it, so while resolving a path the kernel can easily tell whether a dentry is a mountpoint by checking whether this list is empty. In the 3.10 kernel this field is gone (in exactly which version it was removed, I don't know). With the field removed, how does the kernel decide whether a dentry is a mountpoint? By a flag bit: DCACHE_MOUNTED (0x00010000) is set in d_flags.
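
In those later kernels the test is roughly the following (from include/linux/dcache.h in 3.10-era sources); the 2.4.0 struct dentry, with its d_vfsmnt field, follows below:

// Roughly the 3.10-era check: a dentry is a mountpoint iff the
// DCACHE_MOUNTED bit is set in d_flags.
static inline int d_mountpoint(struct dentry *dentry)
{
	return dentry->d_flags & DCACHE_MOUNTED;
}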

include/linux/dcache.h:59

struct dentry {
	atomic_t d_count;
	unsigned int d_flags;
	struct inode  * d_inode;	/* Where the name belongs to - NULL is negative */
	struct dentry * d_parent;	/* parent directory */
	struct list_head d_vfsmnt;
	struct list_head d_hash;	/* lookup hash list */
	struct list_head d_lru;		/* d_count = 0 LRU list */
	struct list_head d_child;	/* child of parent list */
	struct list_head d_subdirs;	/* our children */
	struct list_head d_alias;	/* inode alias list */
	struct qstr d_name;
	unsigned long d_time;		/* used by d_revalidate */
	struct dentry_operations  *d_op;
	struct super_block * d_sb;	/* The root of the dentry tree */
	unsigned long d_reftime;	/* last time referenced */
	void * d_fsdata;		/* fs-specific data */
	unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
};