Linux内核VFS层和具体文件系统层的inode的关系
下面以 ext2 为例,说明 ext2 inode 与 VFS inode 的关系。struct inode 在 Linux 2.4.0 里的定义如下:
/*
 * VFS inode as defined in Linux 2.4.0 (excerpt; "// ..." marks elided
 * fields). In this version the filesystem-specific in-memory inode data
 * is embedded by value in the trailing union `u`.
 */
struct inode {
struct list_head i_hash;
struct list_head i_list;
struct list_head i_dentry;
struct list_head i_dirty_buffers;
unsigned long i_ino;
atomic_t i_count;
kdev_t i_dev;
umode_t i_mode;
nlink_t i_nlink;
uid_t i_uid;
gid_t i_gid;
kdev_t i_rdev;
loff_t i_size;
time_t i_atime;
time_t i_mtime;
time_t i_ctime;
unsigned long i_blksize;
unsigned long i_blocks;
unsigned long i_version;
struct semaphore i_sem;
struct semaphore i_zombie;
struct inode_operations *i_op;
struct file_operations *i_fop; /* former ->i_op->default_file_ops */
struct super_block *i_sb;
wait_queue_head_t i_wait;
// ...
/*
 * One member per filesystem. Because this is a union, its storage is
 * sized for the largest member no matter which filesystem owns the inode.
 */
union {
struct minix_inode_info minix_i;
struct ext2_inode_info ext2_i;
// ...
/* Untyped pointer member; its users are not shown in this excerpt. */
void *generic_ip;
} u;
};
受此定义影响,我一直以为新版本(linux-6.6.27)也类似:要么是 struct inode 包含 struct ext2_inode_info,要么是通过某个指针指向这个结构。其实不是。
首先明确:在新版本里是 struct ext2_inode_info 包含 struct inode 的关系,定义如下:
// fs/ext2/ext2.h
/*
* second extended file system inode data in memory
*/
struct ext2_inode_info {
__le32 i_data[15];
__u32 i_flags;
__u32 i_faddr;
__u8 i_frag_no;
__u8 i_frag_size;
__u16 i_state;
__u32 i_file_acl;
__u32 i_dir_acl;
__u32 i_dtime;
/*
* i_block_group is the number of the block group which contains
* this file's inode. Constant across the lifetime of the inode,
* it is used for making block allocation decisions - we try to
* place a file's data blocks near its inode block, and new inodes
* near to their parent directory's inode.
*/
__u32 i_block_group;
/* block reservation info */
struct ext2_block_alloc_info *i_block_alloc_info;
__u32 i_dir_start_lookup;
#ifdef CONFIG_EXT2_FS_XATTR
/*
* Extended attributes can be read independently of the main file
* data. Taking i_mutex even when reading would cause contention
* between readers of EAs and writers of regular file data, so
* instead we synchronize on xattr_sem when reading or changing
* EAs.
*/
struct rw_semaphore xattr_sem;
#endif
rwlock_t i_meta_lock;
/*
* truncate_mutex is for serialising ext2_truncate() against
* ext2_getblock(). It also protects the internals of the inode's
* reservation data structures: ext2_reserve_window and
* ext2_reserve_window_node.
*/
struct mutex truncate_mutex;
/*
 * The generic VFS inode is embedded here by value (not pointed to).
 * EXT2_I() applies container_of() to this member to get from a
 * struct inode pointer back to the enclosing ext2_inode_info.
 */
struct inode vfs_inode;
struct list_head i_orphan; /* unlinked but open inodes */
#ifdef CONFIG_QUOTA
struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif
};
可以看到在 struct ext2_inode_info 中有一个 struct inode 类型的成员 vfs_inode,VFS 层使用的就是这个成员。这样定义的好处,可以通过对比另外两种做法来说明:
第一种,像 Linux 2.4.0 内核那样,将所有文件系统的 inode 私有信息都定义到 VFS 层 struct inode 的一个 union 里。但 union 的大小由最大的成员决定,而各文件系统需要的空间差别很大:有的占用很多字节,有的只需要很少的字节,对于后者来说就浪费了内存。
第二种,在 struct inode 中定义一个指针,指向特定文件系统的 inode 信息,例如对 ext2 来说就是 struct ext2_inode_info。但这样在分配 inode 的时候就必然会涉及两次内存分配,比较麻烦。
所以现在这种把 struct inode 内嵌到文件系统私有结构里的方式比较好:只需一次分配,每个文件系统也只占用自己需要的空间。
ext2 文件系统中分配 inode 的函数是 ext2_new_inode,如下所示。它会调用 new_inode,并将该文件系统的 super_block 作为参数传递进去。
// fs/ext2/ialloc.c
/*
 * Excerpt of ext2_new_inode(): obtain a new VFS inode on the superblock
 * of @dir through new_inode(), then derive the ext2-private view of it
 * with EXT2_I().
 *
 * Returns the new inode on success, ERR_PTR(-ENOMEM) if new_inode()
 * fails, or ERR_PTR(err) from one of the failure labels. The body
 * between the setup and mark_inode_dirty() is elided ("// ..."); the
 * jumps to the fail* labels are presumably in that elided part.
 */
struct inode *ext2_new_inode(struct inode *dir, umode_t mode,
const struct qstr *qstr)
{
struct super_block *sb;
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *bh2;
int group, i;
ino_t ino = 0;
struct inode * inode;
struct ext2_group_desc *gdp;
struct ext2_super_block *es;
struct ext2_inode_info *ei;
struct ext2_sb_info *sbi;
int err;
sb = dir->i_sb;
/* The allocation itself is delegated to the VFS helper new_inode(). */
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
/* ei is the ext2-private structure that contains this VFS inode. */
ei = EXT2_I(inode);
sbi = EXT2_SB(sb);
es = sbi->s_es;
// ...
mark_inode_dirty(inode);
ext2_debug("allocating inode %lu\n", inode->i_ino);
ext2_preread_inode(inode);
return inode;
/* Failure paths: each label falls through to the cleanup below it. */
fail_free_drop:
dquot_free_inode(inode);
fail_drop:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
clear_nlink(inode);
discard_new_inode(inode);
return ERR_PTR(err);
fail:
make_bad_inode(inode);
iput(inode);
return ERR_PTR(err);
}
再跟下去
// fs/inode.c
/*
 * Allocate an inode for @sb via new_inode_pseudo() and, if that
 * succeeded, hand it to inode_sb_list_add(). Returns the inode, or NULL
 * when the allocation failed.
 */
struct inode *new_inode(struct super_block *sb)
{
struct inode *inode;
inode = new_inode_pseudo(sb);
if (inode)
inode_sb_list_add(inode);
return inode;
}
// fs/inode.c
/*
 * Allocate an inode for @sb with alloc_inode() and reset its i_state to
 * 0 while holding inode->i_lock. Returns the inode, or NULL when
 * alloc_inode() failed.
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
struct inode *inode = alloc_inode(sb);
if (inode) {
spin_lock(&inode->i_lock);
inode->i_state = 0;
spin_unlock(&inode->i_lock);
}
return inode;
}
// fs/inode.c
/*
 * Allocate and initialise an inode for @sb.
 *
 * If the superblock's operations table provides ->alloc_inode, that
 * filesystem-specific hook performs the allocation; otherwise the inode
 * comes from the generic inode_cachep. The result is then initialised
 * with inode_init_always().
 *
 * Returns the inode, or NULL if allocation or initialisation failed.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
const struct super_operations *ops = sb->s_op;
struct inode *inode;
if (ops->alloc_inode)
inode = ops->alloc_inode(sb);
else
inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);
if (!inode)
return NULL;
/* Initialisation failed: release the inode through the fs hooks. */
if (unlikely(inode_init_always(sb, inode))) {
if (ops->destroy_inode) {
ops->destroy_inode(inode);
/* With ->destroy_inode but no ->free_inode there is nothing more to do here. */
if (!ops->free_inode)
return NULL;
}
/* i_callback() is not shown here; presumably it frees the inode, using ->free_inode when set. */
inode->free_inode = ops->free_inode;
i_callback(&inode->i_rcu);
return NULL;
}
return inode;
}
最终会来到 alloc_inode,该函数会先判断 super_block 的 super_operations 是否有 alloc_inode 成员函数指针:如果有就调用该函数分配,否则就从通用的 inode_cachep 上分配。
对于 ext2 来说,这个函数就是 ext2_alloc_inode,它在 ext2_inode_cachep 这个 kmem cache 上分配一个 struct ext2_inode_info,并返回其中 vfs_inode 成员的地址。
// fs/ext2/super.c
/*
 * ext2 implementation of super_operations->alloc_inode.
 *
 * Allocates a whole struct ext2_inode_info from ext2_inode_cachep,
 * initialises a few ext2-private fields, and returns a pointer to the
 * VFS inode embedded in it. Returns NULL if the allocation fails.
 */
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
ei = alloc_inode_sb(sb, ext2_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->i_block_alloc_info = NULL;
inode_set_iversion(&ei->vfs_inode, 1);
#ifdef CONFIG_QUOTA
memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
#endif
/* Callers only see the embedded VFS inode; EXT2_I() maps it back to ei. */
return &ei->vfs_inode;
}
// ...
/*
 * ext2 superblock operations (excerpt). Registering ext2_alloc_inode as
 * ->alloc_inode is what makes alloc_inode() in fs/inode.c take the
 * ext2-specific allocation path.
 */
static const struct super_operations ext2_sops = {
.alloc_inode = ext2_alloc_inode,
// ...
};
super_block的super_operations定义如下
// include/linux/fs.h
/* Excerpt: only the pointer to the superblock operations table is shown. */
struct super_block {
// ...
const struct super_operations *s_op;
// ...
};
/* Excerpt: only the inode allocation hook is shown. */
struct super_operations {
/* Optional; alloc_inode() in fs/inode.c calls it when it is non-NULL. */
struct inode *(*alloc_inode)(struct super_block *sb);
// ...
};
需要注意的是,new_inode 返回的是 struct inode * 类型,在 ext2 层还不能直接用,还需要用 EXT2_I 转换成 struct ext2_inode_info *。转换方法就是非常常见的 container_of。
// fs/ext2/ext2.h
/*
 * Map a VFS inode pointer to the ext2_inode_info that embeds it as the
 * vfs_inode member. container_of() does this by pointer arithmetic, so
 * @inode must actually be the vfs_inode member of an ext2_inode_info.
 */
static inline struct ext2_inode_info *EXT2_I(struct inode *inode)
{
return container_of(inode, struct ext2_inode_info, vfs_inode);
}
container_of 的定义为:
// include/linux/container_of.h
/*
 * container_of - get a pointer to the structure that contains a member
 * @ptr:    pointer to the member
 * @type:   type of the containing structure
 * @member: name of the member within @type
 *
 * Subtracts offsetof(type, member) from @ptr to obtain the start of the
 * containing structure. The static_assert rejects, at compile time, a
 * @ptr whose pointee type is neither the member's type nor void.
 */
#define container_of(ptr, type, member) ({ \
void *__mptr = (void *)(ptr); \
static_assert(__same_type(*(ptr), ((type *)0)->member) || \
__same_type(*(ptr), void), \
"pointer type mismatch in container_of()"); \
((type *)(__mptr - offsetof(type, member))); })
在这种情况下(省略其中用于编译期类型检查的 static_assert 之后)就展开为
({
void *__mptr = (void *)(inode);
((struct ext2_inode_info *)(__mptr - offsetof(struct ext2_inode_info, vfs_inode)));
})
dentry 则貌似不存在这种包含关系:
// include/linux/dcache.h
/*
 * VFS directory entry. Among the fields shown, the only
 * filesystem-specific hook is the opaque d_fsdata pointer.
 */
struct dentry {
/* RCU lookup touched fields */
unsigned int d_flags; /* protected by d_lock */
seqcount_spinlock_t d_seq; /* per dentry seqlock */
struct hlist_bl_node d_hash; /* lookup hash list */
struct dentry *d_parent; /* parent directory */
struct qstr d_name;
struct inode *d_inode; /* Where the name belongs to - NULL is
* negative */
unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
/* Ref lookup also touches following */
struct lockref d_lockref; /* per-dentry lock and refcount */
const struct dentry_operations *d_op;
struct super_block *d_sb; /* The root of the dentry tree */
unsigned long d_time; /* used by d_revalidate */
void *d_fsdata; /* fs-specific data */
union {
struct list_head d_lru; /* LRU list */
wait_queue_head_t *d_wait; /* in-lookup ones only */
};
struct list_head d_child; /* child of parent list */
struct list_head d_subdirs; /* our children */
/*
* d_alias and d_rcu can share memory
*/
union {
struct hlist_node d_alias; /* inode alias list */
struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */
struct rcu_head d_rcu;
} d_u;
} __randomize_layout;
// fs/ext2/ext2.h
/*
 * ext2 directory entry record: a fixed header followed by the file name
 * as a flexible array member. It does not embed a struct dentry.
 * NOTE(review): the explicit little-endian field types (__le32/__le16)
 * suggest this is the on-disk layout - confirm against fs/ext2/dir.c.
 */
struct ext2_dir_entry_2 {
__le32 inode; /* Inode number */
__le16 rec_len; /* Directory entry length */
__u8 name_len; /* Name length */
__u8 file_type;
char name[]; /* File name, up to EXT2_NAME_LEN */
};
struct dentry 中的 d_fsdata 在 ext2 中也不是用来指向 struct ext2_dir_entry_2 的,ext2 根本就没有使用这个字段。