Linux内核VFS层和具体文件系统层的inode的关系

以ext2为例，说明ext2 inode与VFS inode的关系。在Linux 2.4.0中，struct inode的定义如下（有删节）：

/*
 * VFS inode as quoted from Linux 2.4.0 (abridged: each "// ..." marks
 * members that were left out of this excerpt).
 *
 * The point of interest is the trailing union `u`: the filesystem-specific
 * in-memory inode data (minix_i, ext2_i, ...) is stored by value inside the
 * VFS inode itself, with generic_ip as an untyped pointer alternative.
 */
struct inode {
    struct list_head    i_hash;
    struct list_head    i_list;
    struct list_head    i_dentry;

    struct list_head    i_dirty_buffers;

    unsigned long       i_ino;
    atomic_t        i_count;
    kdev_t          i_dev;
    umode_t         i_mode;
    nlink_t         i_nlink;
    uid_t           i_uid;
    gid_t           i_gid;
    kdev_t          i_rdev;
    loff_t          i_size;
    time_t          i_atime;
    time_t          i_mtime;
    time_t          i_ctime;
    unsigned long       i_blksize;
    unsigned long       i_blocks;
    unsigned long       i_version;
    struct semaphore    i_sem;
    struct semaphore    i_zombie;
    struct inode_operations *i_op;
    struct file_operations  *i_fop; /* former ->i_op->default_file_ops */
    struct super_block  *i_sb;
    wait_queue_head_t   i_wait;

    // ...

    /*
     * Per-filesystem private data. All members overlay the same storage,
     * so the union (and therefore every inode) is as large as its largest
     * member.
     */
    union {
        struct minix_inode_info     minix_i;
        struct ext2_inode_info      ext2_i;
        // ...
        void                *generic_ip;
    } u;
};

受此定义影响，我一直以为新版本（linux-6.6.27）也类似：要么是struct inode包含struct ext2_inode_info，要么是struct inode通过某个指针指向这个结构。其实不是。

首先明确：是struct ext2_inode_info包含struct inode的关系

定义如下：

// fs/ext2/ext2.h

/*
 * second extended file system inode data in memory
 *
 * The VFS inode is embedded by value as the vfs_inode member, so a single
 * object of this type carries both the ext2-private fields and the
 * struct inode that generic code works on.
 */
struct ext2_inode_info {
        __le32  i_data[15];
        __u32   i_flags;
        __u32   i_faddr;
        __u8    i_frag_no;
        __u8    i_frag_size;
        __u16   i_state;
        __u32   i_file_acl;
        __u32   i_dir_acl;
        __u32   i_dtime;

        /*
         * i_block_group is the number of the block group which contains
         * this file's inode.  Constant across the lifetime of the inode,
         * it is used for making block allocation decisions - we try to
         * place a file's data blocks near its inode block, and new inodes
         * near to their parent directory's inode.
         */
        __u32   i_block_group;

        /* block reservation info */
        struct ext2_block_alloc_info *i_block_alloc_info;

        __u32   i_dir_start_lookup;
#ifdef CONFIG_EXT2_FS_XATTR
        /*
         * Extended attributes can be read independently of the main file
         * data. Taking i_mutex even when reading would cause contention
         * between readers of EAs and writers of regular file data, so
         * instead we synchronize on xattr_sem when reading or changing
         * EAs.
         */
        struct rw_semaphore xattr_sem;
#endif
        rwlock_t i_meta_lock;

        /*
         * truncate_mutex is for serialising ext2_truncate() against
         * ext2_getblock().  It also protects the internals of the inode's
         * reservation data structures: ext2_reserve_window and
         * ext2_reserve_window_node.
         */
        struct mutex truncate_mutex;
        /*
         * Embedded (not pointed-to) VFS inode. EXT2_I() maps a pointer to
         * this member back to the enclosing ext2_inode_info.
         */
        struct inode    vfs_inode;
        struct list_head i_orphan;      /* unlinked but open inodes */
#ifdef CONFIG_QUOTA
        struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif
};

可以看到struct ext2_inode_info中有一个struct inode类型的成员vfs_inode，VFS层使用的就是这个成员。这样定义的好处，可以通过与下面两种方案的对比来说明：

第一种，像Linux 2.4.0内核那样，把所有文件系统的inode私有信息都放进VFS层struct inode的一个union里。但union的大小由其中最大的成员决定，而各文件系统的私有信息有的占用很多字节，有的只需要很少的字节，于是对只需要很少空间的文件系统来说，每个inode都浪费了内存。

第二种，在struct inode定义一个指针指向特定文件系统的inode，例如对ext2来说就是struct ext2_inode_info。但这样在分配inode的时候就必然会涉及两次内存分配，比较麻烦。

所以现在这种内嵌方式比较好：每个文件系统只占用自己需要的空间，一次内存分配就能同时得到VFS inode和文件系统私有信息，二者之间通过container_of按固定偏移换算，不需要额外的指针。

ext2文件系统中分配inode的方法如下，它会调用到new_inode，并将该文件系统的super_block作为参数传递进去。

// fs/ext2/ialloc.c
/*
 * Abridged excerpt of ext2's inode creation path ("// ..." marks the elided
 * middle of the function).
 *
 * Obtains a fresh VFS inode on dir's super block through new_inode(), then
 * derives the ext2-private view of that inode (ei) and of the super block
 * (sbi, es).
 *
 * Returns the new inode on success, ERR_PTR(-ENOMEM) if new_inode() fails,
 * or ERR_PTR(err) when one of the failure labels is reached.
 */
struct inode *ext2_new_inode(struct inode *dir, umode_t mode,
                             const struct qstr *qstr)
{
        struct super_block *sb;
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *bh2;
        int group, i;
        ino_t ino = 0;
        struct inode * inode;
        struct ext2_group_desc *gdp;
        struct ext2_super_block *es;
        struct ext2_inode_info *ei;
        struct ext2_sb_info *sbi;
        int err;

        sb = dir->i_sb;
        /* new_inode() hands back the generic struct inode ... */
        inode = new_inode(sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);

        /* ... which EXT2_I() converts to the enclosing ext2_inode_info. */
        ei = EXT2_I(inode);
        sbi = EXT2_SB(sb);
        es = sbi->s_es;

        // ...
        mark_inode_dirty(inode);
        ext2_debug("allocating inode %lu\n", inode->i_ino);
        ext2_preread_inode(inode);
        return inode;

        /*
         * Failure paths. NOTE: the gotos that jump to these labels, and the
         * assignments to err, are in the elided part of the function.
         * fail_free_drop falls through into fail_drop.
         */
fail_free_drop:
        dquot_free_inode(inode);

fail_drop:
        dquot_drop(inode);
        inode->i_flags |= S_NOQUOTA;
        clear_nlink(inode);
        discard_new_inode(inode);
        return ERR_PTR(err);

fail:
        make_bad_inode(inode);
        iput(inode);
        return ERR_PTR(err);
}

再跟下去

// fs/inode.c
/*
 * Allocate an inode for super block sb via new_inode_pseudo() and, if that
 * succeeded, pass it to inode_sb_list_add().
 *
 * Returns the inode, or NULL when new_inode_pseudo() returned NULL.
 */
struct inode *new_inode(struct super_block *sb)
{
        struct inode *inode;

        inode = new_inode_pseudo(sb);
        if (inode)
                inode_sb_list_add(inode);
        return inode;
}

// fs/inode.c
/*
 * Allocate an inode for super block sb via alloc_inode() and, on success,
 * reset its i_state to 0 while holding inode->i_lock.
 *
 * Returns the inode, or NULL when alloc_inode() returned NULL.
 */
struct inode *new_inode_pseudo(struct super_block *sb)
{
        struct inode *inode = alloc_inode(sb);

        if (inode) {
                spin_lock(&inode->i_lock);
                inode->i_state = 0;
                spin_unlock(&inode->i_lock);
        }
        return inode;
}

// fs/inode.c
/*
 * Allocate and initialise an inode for super block sb.
 *
 * If the filesystem supplies super_operations->alloc_inode, that hook does
 * the allocation; otherwise a struct inode is taken from inode_cachep.
 *
 * Returns the inode, or NULL if the allocation failed or
 * inode_init_always() reported an error (in which case the inode is torn
 * down again before returning).
 */
static struct inode *alloc_inode(struct super_block *sb)
{
        const struct super_operations *ops = sb->s_op;
        struct inode *inode;

        /* Filesystem-specific allocator takes precedence over the generic cache. */
        if (ops->alloc_inode)
                inode = ops->alloc_inode(sb);
        else
                inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);

        if (!inode)
                return NULL;

        if (unlikely(inode_init_always(sb, inode))) {
                /*
                 * Initialisation failed: undo the allocation. With a
                 * destroy_inode hook but no free_inode hook, destroy_inode
                 * is the only teardown step performed here.
                 */
                if (ops->destroy_inode) {
                        ops->destroy_inode(inode);
                        if (!ops->free_inode)
                                return NULL;
                }
                /*
                 * NOTE(review): i_callback() is not shown in this excerpt;
                 * presumably it releases the inode using the free_inode
                 * hook stored just above - confirm in fs/inode.c.
                 */
                inode->free_inode = ops->free_inode;
                i_callback(&inode->i_rcu);
                return NULL;
        }

        return inode;
}

最终会来到alloc_inode，该函数会先判断super_block的super_operations是否有alloc_inode成员函数指针，如果有就调用该函数分配，否则就从通用的inode_cachep中分配一个struct inode。

对于ext2来说，就是ext2_alloc_inode，它是在ext2_inode_cachep这个kmem cache上分配。注意它分配的是整个struct ext2_inode_info，返回的则是其中vfs_inode成员的地址。

// fs/ext2/super.c
/*
 * ext2's super_operations->alloc_inode hook.
 *
 * Allocates a whole struct ext2_inode_info from ext2_inode_cachep,
 * initialises a few ext2-private fields, and returns the address of the
 * embedded vfs_inode member rather than of the containing object.
 *
 * Returns NULL if the allocation fails.
 */
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
        struct ext2_inode_info *ei;
        ei = alloc_inode_sb(sb, ext2_inode_cachep, GFP_KERNEL);
        if (!ei)
                return NULL;
        ei->i_block_alloc_info = NULL;
        inode_set_iversion(&ei->vfs_inode, 1);
#ifdef CONFIG_QUOTA
        memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
#endif

        /* Callers receive the generic inode; EXT2_I() recovers ei from it. */
        return &ei->vfs_inode;
}

// ...

/*
 * ext2's super_operations table (abridged). The alloc_inode entry makes
 * alloc_inode() in fs/inode.c call ext2_alloc_inode() instead of allocating
 * a bare struct inode.
 */
static const struct super_operations ext2_sops = {
        .alloc_inode    = ext2_alloc_inode,

        // ...
};

super_block的super_operations定义如下

// include/linux/fs.h

/*
 * Abridged: only the member relevant to inode allocation is shown.
 */
struct super_block {
        // ...
        /* Operations table consulted by alloc_inode() as sb->s_op. */
        const struct super_operations   *s_op;
        // ...
};

/*
 * Abridged: only the alloc_inode hook is shown.
 */
struct super_operations {
        /*
         * Optional per-filesystem inode allocator; alloc_inode() checks it
         * for NULL and otherwise allocates from inode_cachep.
         */
        struct inode *(*alloc_inode)(struct super_block *sb);
        // ...
};

需要注意的是new_inode返回的是struct inode *类型，在ext2层还不能直接用，还需要用EXT2_I转换成struct ext2_inode_info *。转换方法就是非常常见的container_of。

// fs/ext2/ext2.h
/*
 * Convert a VFS inode pointer into a pointer to the ext2_inode_info that
 * embeds it as its vfs_inode member (via container_of).
 *
 * This is only meaningful when inode really is the vfs_inode member of an
 * ext2_inode_info, e.g. one produced by ext2_alloc_inode().
 */
static inline struct ext2_inode_info *EXT2_I(struct inode *inode)
{
        return container_of(inode, struct ext2_inode_info, vfs_inode);
}

container_of定义为

// include/linux/container_of.h
/*
 * container_of - map a pointer to a struct member back to the struct.
 * @ptr:    pointer to the member
 * @type:   type of the enclosing struct
 * @member: name of the member within @type
 *
 * Evaluates to a (type *) obtained by subtracting offsetof(type, member)
 * from @ptr. The static_assert rejects, at compile time, a @ptr whose
 * pointee type is neither the member's type nor void.
 */
#define container_of(ptr, type, member) ({                              \
        void *__mptr = (void *)(ptr);                                   \
        static_assert(__same_type(*(ptr), ((type *)0)->member) ||       \
                      __same_type(*(ptr), void),                        \
                      "pointer type mismatch in container_of()");       \
        ((type *)(__mptr - offsetof(type, member))); })

在这种情况下（略去仅用于编译期类型检查的static_assert），就展开为

({
    /* Untyped copy of the member pointer, so the subtraction below is in bytes. */
    void *__mptr = (void *)(inode);
    /* Step back by the member's offset to reach the start of the enclosing struct. */
    ((struct ext2_inode_info *)(__mptr - offsetof(struct ext2_inode_info, vfs_inode)));
})

dentry则貌似不存在这种内嵌关系：struct dentry与ext2的目录项结构struct ext2_dir_entry_2是两个相互独立的结构。

// include/linux/dcache.h
/*
 * The VFS dentry, as quoted from include/linux/dcache.h.
 *
 * In contrast to struct inode / struct ext2_inode_info, no
 * filesystem-specific structure is involved in this definition; the only
 * filesystem-private hook visible here is the untyped d_fsdata pointer.
 */
struct dentry {
        /* RCU lookup touched fields */
        unsigned int d_flags;           /* protected by d_lock */
        seqcount_spinlock_t d_seq;      /* per dentry seqlock */
        struct hlist_bl_node d_hash;    /* lookup hash list */
        struct dentry *d_parent;        /* parent directory */
        struct qstr d_name;
        struct inode *d_inode;          /* Where the name belongs to - NULL is
                                         * negative */
        unsigned char d_iname[DNAME_INLINE_LEN];        /* small names */

        /* Ref lookup also touches following */
        struct lockref d_lockref;       /* per-dentry lock and refcount */
        const struct dentry_operations *d_op;
        struct super_block *d_sb;       /* The root of the dentry tree */
        unsigned long d_time;           /* used by d_revalidate */
        void *d_fsdata;                 /* fs-specific data */

        union {
                struct list_head d_lru;         /* LRU list */
                wait_queue_head_t *d_wait;      /* in-lookup ones only */
        };
        struct list_head d_child;       /* child of parent list */
        struct list_head d_subdirs;     /* our children */
        /*
         * d_alias and d_rcu can share memory
         */
        union {
                struct hlist_node d_alias;      /* inode alias list */
                struct hlist_bl_node d_in_lookup_hash;  /* only for in-lookup ones */
                struct rcu_head d_rcu;
        } d_u;
} __randomize_layout;

// fs/ext2/ext2.h
/*
 * ext2 directory entry record: inode number, record length, name length,
 * file type, then the name itself as a flexible array member.
 *
 * NOTE(review): the explicit little-endian field types suggest this mirrors
 * the on-disk layout - confirm against fs/ext2/ext2.h.
 */
struct ext2_dir_entry_2 {
        __le32  inode;                  /* Inode number */
        __le16  rec_len;                /* Directory entry length */
        __u8    name_len;               /* Name length */
        __u8    file_type;
        char    name[];                 /* File name, up to EXT2_NAME_LEN */
};

struct dentry中的d_fsdata在ext2中也不是用来指向struct ext2_dir_entry_2的，ext2根本没有使用这个字段。