ZYB ARTICLES REPOS

操作系统实验三

基于 Linux 6.6.27

为什么我的initramfs里明明没有 /dev /root两个目录 和/dev/console的文件,而在启动init脚本后,会出现这些文件夹和文件。

所以写个C程序,替换掉init脚本,这个程序的目的就是直接列出 / 目录下所有文件名

#include <dirent.h>
#include <errno.h>
#include <stdio.h>

/*
 * Minimal replacement for the initramfs /init script: print the name of
 * every entry found directly under "/" (including "." and ".."), one per
 * line, then exit.
 *
 * Returns 0 on success, 1 if the root directory cannot be opened or read.
 *
 * NOTE(review): when this runs as the init process, returning from main()
 * ends that process; that is acceptable for this one-shot experiment but
 * confirm it is what you want before reusing the program.
 */
int main(void) {
    DIR *dir;
    struct dirent *entry;

    dir = opendir("/");
    if (dir == NULL) {
        perror("Error opening root directory");
        return 1;
    }

    for (;;) {
        /*
         * readdir() returns NULL both at end-of-directory and on error;
         * clearing errno first is the only way to tell the two apart.
         */
        errno = 0;
        entry = readdir(dir);
        if (entry == NULL) {
            break;
        }
        printf("%s\n", entry->d_name);
    }

    if (errno != 0) {
        perror("Error reading root directory");
        closedir(dir);
        return 1;
    }

    closedir(dir);
    return 0;
}

需要静态编译(initramfs 里只有 bin/busybox 和 init,没有动态链接器和共享库,动态链接的程序无法被加载),不然的话会:

[    1.910634][    T1] Run /init as init process
[    1.912440][    T1] Failed to execute /init (error -2)

经过测试,列出如下

.
..
dev
root
bin
init

这说明,并不是因为加载脚本默认创建的,而是在内核一早就准备好了的。

那是在什么时候创建的呢?

接下来,在内核代码init/initramfs.c里添加这段代码

/*
 * dir_context actor used by print_rootfs_files(): logs one directory entry.
 *
 * @ctx:    iteration context (unused here)
 * @name:   entry name; printed with an explicit length, so it is not
 *          treated as NUL-terminated
 * @namlen: number of bytes of @name to print
 * @offset, @ino, @d_type: unused here
 *
 * Always returns true.
 */
static bool print_file_name(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type)
{
    printk(" rootfs / file: %.*s\n", namlen, name);
    return true;
}


/*
 * Debug helper: log every entry of the directory at @rootfs_path.
 *
 * @rootfs_path: path of the directory to list (e.g. "/")
 * @func:        name of the calling function, printed in the header line
 * @line:        line number of the call site, printed in the header line
 *
 * A header line "print_rootfs_files:<func>:<line>" is logged first so that
 * several call sites can be told apart in the log. Failures to open or to
 * iterate the directory are reported with pr_err(); nothing is returned.
 */
static void print_rootfs_files(const char *rootfs_path, const char *func, int line)
{
    struct file *file;
    struct dir_context ctx = {
        .actor = print_file_name,
    };
    int err;

    printk("%s:%s:%d\n", __func__, func, line);

    file = filp_open(rootfs_path, O_RDONLY | O_DIRECTORY, 0);
    if (IS_ERR(file)) {
        pr_err("Failed to open rootfs directory: %ld\n", PTR_ERR(file));
        return;
    }

    /* Do not silently drop an iteration failure: an empty listing would
     * otherwise be indistinguishable from an empty directory. */
    err = iterate_dir(file, &ctx);
    if (err)
        pr_err("Failed to iterate rootfs directory: %d\n", err);

    filp_close(file, NULL);
}

并在unpack_to_rootfs之后调用 得到结果

[    0.832556][   T29] print_rootfs_files:do_populate_rootfs:724
[    0.832556][   T29]  rootfs / file: .
[    0.832556][   T29]  rootfs / file: ..
[    0.836995][   T29] print_rootfs_files:do_populate_rootfs:726
[    0.837040][   T29]  rootfs / file: .
[    0.837185][   T29]  rootfs / file: ..
[    0.837307][   T29]  rootfs / file: dev
[    0.837412][   T29]  rootfs / file: root
[    0.837545][   T29] Unpacking initramfs...
[    0.845110][   T29] unpack_to_rootfs filp open bin/busybox
[    0.918683][   T29] unpack_to_rootfs filp open init
[    0.935362][   T29] print_rootfs_files:do_populate_rootfs:739
[    0.935439][   T29]  rootfs / file: .
[    0.935592][   T29]  rootfs / file: ..
[    0.935592][   T29]  rootfs / file: dev
[    0.935592][   T29]  rootfs / file: root
[    0.935592][   T29]  rootfs / file: bin
[    0.935592][   T29]  rootfs / file: init

可以看出在do_populate_rootfs第一次调用unpack_to_rootfs前就没有 /dev 和 /root 但是调用之后就已经创建好了 /dev 和 /root

/*
 * Populate rootfs from up to two sources: first the initramfs located at
 * __initramfs_start (built into the kernel), then, if initrd_start is set
 * and CONFIG_INITRAMFS_FORCE is not enabled, the image found at
 * initrd_start..initrd_end. The initrd memory is freed afterwards unless
 * do_retain_initrd is set.
 */
static void __init do_populate_rootfs(void *unused, async_cookie_t cookie)
{
        /* Load the built in initramfs */
        char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
        if (err)
                panic_show_mem("%s", err); /* Failed to decompress INTERNAL initramfs */

        /* No external image, or the built-in one is forced: skip the second unpack. */
        if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE))
                goto done;

        if (IS_ENABLED(CONFIG_BLK_DEV_RAM))
                printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n");
        else
                printk(KERN_INFO "Unpacking initramfs...\n");

        /* Second unpack: the image located at initrd_start. */
        err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start);
        if (err) {
                /* With CONFIG_BLK_DEV_RAM the failure is handed to
                 * populate_initrd_image(); otherwise it is only logged. */
#ifdef CONFIG_BLK_DEV_RAM
                populate_initrd_image(err);
#else
                printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
#endif
        }

done:
        /*
         * If the initrd region is overlapped with crashkernel reserved region,
         * free only memory that is not part of crashkernel region.
         */
        if (!do_retain_initrd && initrd_start && !kexec_free_initrd())
                free_initrd_mem(initrd_start, initrd_end);
        initrd_start = 0;
        initrd_end = 0;

        flush_delayed_fput();
        task_work_run();
}

可以看到这两次unpack_to_rootfs是有明显的差异的

一次是:unpack_to_rootfs(__initramfs_start, __initramfs_size); 这行代码处理的是内嵌在内核映像中的 initramfs。在编译内核时,可以选择将 initramfs 直接编译进内核映像中。__initramfs_start 和 __initramfs_size 是由编译器在链接阶段设置的,它们分别表示内嵌 initramfs 的开始地址和大小。 内嵌 initramfs 的生成过程是通过内核的配置和编译系统(Kconfig 和 Makefile)完成的。具体来说,在内核配置文件(通常是 .config 文件)中,有一个选项叫做 CONFIG_INITRAMFS_SOURCE,它指定了用于生成内嵌 initramfs 的源文件或目录的路径。

# usr/Makefile
ramfs-input := $(CONFIG_INITRAMFS_SOURCE)
cpio-data :=

# If CONFIG_INITRAMFS_SOURCE is empty, generate a small initramfs with the
# default contents.
ifeq ($(ramfs-input),)
ramfs-input := $(srctree)/$(src)/default_cpio_list
endif

usr/default_cpio_list的内容为

# SPDX-License-Identifier: GPL-2.0-only
# This is a very simple, default initramfs

dir /dev 0755 0 0
nod /dev/console 0600 0 0 c 5 1
dir /root 0700 0 0

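一次是:unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); 这行代码处理的是由引导加载器(如 GRUB)单独传递给内核的映像,也就是本文中的 initramfs.cpio.gz。内核先把它当作 cpio 格式的 initramfs 解包到 rootfs;只有在解包失败且启用了 CONFIG_BLK_DEV_RAM 时,才会通过 populate_initrd_image 把它当作旧式 initrd 映像处理。initrd 是一种旧式的 RAM 磁盘技术,它和 initramfs 功能类似,但是使用方式不同。这个映像是一个单独的文件,需要在引导加载器中显式地传递给内核。initrd_start 和 initrd_end 由引导加载器设置,它们表示该映像在内存中的开始和结束地址。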

因此 /dev /root /dev/console 不是initramfs.cpio.gz里带的,也不是内核创建的。而是和内核嵌在一起的initramfs里的内容。

修改usr/default_cpio_list的内容,重新编译启动内核就能轻松验证。

如果在启动内核的时候没有指定initramfs.cpio.gz,或里面没有init程序,或参数rdinit指定的程序不存在,内核都会自己尝试挂载根文件系统【但要指定root参数】

如果不使用initramfs,让内核自己挂载根文件系统,init进程不能再用rdinit参数指定,默认用 /sbin/init /etc/init /bin/init /bin/sh等

下面来看内核自己挂载硬盘(以/dev/sda1为例)为根目录的情况【可以删除initramfs.cpio.gz里的init,或不指定initramfs.cpio.gz来触发】

prepare_namespace调用mount_root将/dev/sda1挂载到/root目录下(do_mount_root里还会用init_chdir把当前工作目录切换到/root)。 devtmpfs_mount将devtmpfs挂载到/dev目录下。 init_mount(".", "/", NULL, MS_MOVE, NULL)把挂载在当前工作目录上的文件系统移动到 / 上:在这个调用中,. 表示当前进程的工作目录(也就是刚挂载好的/root),/ 表示要移动到的目标挂载点,MS_MOVE 是挂载标志,表示移动一个已有的挂载,而不是新建挂载。 init_chroot(".")将当前进程的根目录更改为当前工作目录,也就是挂载好的根文件系统,这样之后按路径访问到的就是该根文件系统中的文件。

// init/do_mounts.c
/*
 * Prepare the namespace - decide what/where to mount, load ramdisks, etc.
 *
 * Sequence visible here: optional root_delay sleep, wait for device
 * probing, md_run_setup(), resolve ROOT_DEV from saved_root_name, then
 * either initrd_load() handles everything or the root device is mounted
 * via mount_root(). Finally devtmpfs is mounted, the mount at "." is moved
 * onto "/" and the process root is changed to ".".
 */
void __init prepare_namespace(void)
{
        if (root_delay) {
                printk(KERN_INFO "Waiting %d sec before mounting root device...\n",
                       root_delay);
                ssleep(root_delay);
        }

        /*
         * wait for the known devices to complete their probing
         *
         * Note: this is a potential source of long boot delays.
         * For example, it is not atypical to wait 5 seconds here
         * for the touchpad of a laptop to initialize.
         */
        wait_for_device_probe();

        md_run_setup();

        /* Only parse a root device when a root name was saved. */
        if (saved_root_name[0])
                ROOT_DEV = parse_root_device(saved_root_name);

        /* A non-zero return from initrd_load() skips mounting the root device. */
        if (initrd_load(saved_root_name))
                goto out;

        if (root_wait)
                wait_for_root(saved_root_name);
        mount_root(saved_root_name);
out:
        devtmpfs_mount();
        /* Move the mount at the current directory onto "/" ... */
        init_mount(".", "/", NULL, MS_MOVE, NULL);
        /* ... and make the current directory the process root. */
        init_chroot(".");
}

init_mount(".", "/", NULL, MS_MOVE, NULL);将进程的当前目录挂载为'/'根目录后,根目录就是硬盘的目录了。 但在init_mount之后调用print_rootfs_files会发现还是rootfs的目录:/dev、/dev/root、/dev/console、/root。 在init_chroot之后调用print_rootfs_files才是硬盘目录。

/*
 * Dispatch on ROOT_DEV to the matching root-mount routine.
 *
 * @root_device_name: root device name, forwarded to the chosen routine.
 *
 * Root_NFS and Root_CIFS use their dedicated helpers; Root_Generic mounts
 * by name through mount_root_generic(); for 0, mount_nodev_root() is tried
 * when both a name and root_fs_names are set, and on failure the code
 * falls through to the default, mount_block_root().
 */
void __init mount_root(char *root_device_name)
{
        switch (ROOT_DEV) {
        case Root_NFS:
                mount_nfs_root();
                break;
        case Root_CIFS:
                mount_cifs_root();
                break;
        case Root_Generic:
                mount_root_generic(root_device_name, root_device_name,
                                   root_mountflags);
                break;
        case 0:
                /* Success (return 0) ends here; anything else falls through. */
                if (root_device_name && root_fs_names &&
                    mount_nodev_root(root_device_name) == 0)
                        break;
                fallthrough;
        default:
                mount_block_root(root_device_name);
                break;
        }
}

在mount_block_root里,在rootfs的/dev里创建一个root设备节点

// init/do_mounts.c
/*
 * Create the node "/dev/root" for ROOT_DEV and mount it through
 * mount_root_generic().
 *
 * @root_device_name: passed on as the pretty name used in messages.
 *
 * A create_dev() failure is only logged; the mount is attempted regardless.
 */
static void __init mount_block_root(char *root_device_name)
{
        int err = create_dev("/dev/root", ROOT_DEV);

        if (err < 0)
                pr_emerg("Failed to create /dev/root: %d\n", err);
        mount_root_generic("/dev/root", root_device_name, root_mountflags);
}

然后使用mount_root_generic将/dev/root挂载到rootfs上

由于此处只有设备节点,但是没有文件系统信息,所以mount_root_generic会遍历内核所有的文件系统名,依次来尝试用do_mount_root挂载/dev/root到rootfs的/root目录。

/*
 * Try to mount @name as root, probing one filesystem type after another.
 *
 * @name:        source passed to do_mount_root()
 * @pretty_name: name shown in error messages
 * @flags:       mount flags; SB_RDONLY is added for a second pass if the
 *               first pass mounted nothing
 *
 * The candidate type names are written into one page as consecutive
 * NUL-terminated strings, either from split_fs_names() (when
 * root_fs_names is set) or from list_bdev_fs_names(). Returns only after
 * a successful mount; every other outcome ends in panic().
 */
void __init mount_root_generic(char *name, char *pretty_name, int flags)
{
        struct page *page = alloc_page(GFP_KERNEL);
        char *fs_names = page_address(page);
        char *p;
        char b[BDEVNAME_SIZE];
        int num_fs, i;

        /* Fallback device description used in the error/panic messages. */
        scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)",
                  MAJOR(ROOT_DEV), MINOR(ROOT_DEV));
        if (root_fs_names)
                num_fs = split_fs_names(fs_names, PAGE_SIZE);
        else
                num_fs = list_bdev_fs_names(fs_names, PAGE_SIZE);
retry:
        /* p steps over the packed strings: advance by length plus the NUL. */
        for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1) {
                int err;

                if (!*p)
                        continue;

                err = do_mount_root(name, p, flags, root_mount_data);
                switch (err) {
                        case 0:
                                goto out;
                        /* These two errors just mean "try the next type". */
                        case -EACCES:
                        case -EINVAL:
                                continue;
                }
                /*
                 * Allow the user to distinguish between failed sys_open
                 * and bad superblock on root device.
                 * and give them a list of the available devices
                 */
                printk("VFS: Cannot open root device \"%s\" or %s: error %d\n",
                                pretty_name, b, err);
                printk("Please append a correct \"root=\" boot option; here are the available partitions:\n");
                printk_all_partitions();

                if (root_fs_names)
                        num_fs = list_bdev_fs_names(fs_names, PAGE_SIZE);
                if (!num_fs)
                        pr_err("Can't find any bdev filesystem to be used for mount!\n");
                else {
                        pr_err("List of all bdev filesystems:\n");
                        for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1)
                                pr_err(" %s", p);
                        pr_err("\n");
                }

                panic("VFS: Unable to mount root fs on %s", b);
        }
        /* Nothing mounted with the given flags: retry once with SB_RDONLY added. */
        if (!(flags & SB_RDONLY)) {
                flags |= SB_RDONLY;
                goto retry;
        }

        printk("List of all partitions:\n");
        printk_all_partitions();
        printk("No filesystem could mount root, tried: ");
        for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1)
                printk(" %s", p);
        printk("\n");
        panic("VFS: Unable to mount root fs on %s", b);
out:
        put_page(page);
}

为什么是挂载到/root目录呢,可以在do_mount_root逻辑里看到init_mount(name, "/root", fs, flags, data_page);,此处的实际调用即init_mount("/dev/root", "/root", fs, flags, data_page)

这个/root目录可以换成别的目录吗? 当然可以,假设为/rootA。 先修改usr/default_cpio_list,dir /rootA 0700 0 0。然后修改do_mount_root函数里的init_mount和init_chdir里的/root为/rootA

/*
 * Mount @name on "/root" as filesystem type @fs and, on success, make
 * "/root" the current directory and record the mounted device in ROOT_DEV.
 *
 * @name:  mount source
 * @fs:    filesystem type name
 * @flags: mount flags
 * @data:  optional mount data string; copied into a freshly allocated page
 *
 * Returns 0 on success, -ENOMEM if the data page cannot be allocated, or
 * the error returned by init_mount().
 */
static int __init do_mount_root(const char *name, const char *fs,
                                 const int flags, const void *data)
{
        struct super_block *s;
        struct page *p = NULL;
        char *data_page = NULL;
        int ret;

        if (data) {
                /* init_mount() requires a full page as fifth argument */
                p = alloc_page(GFP_KERNEL);
                if (!p)
                        return -ENOMEM;
                data_page = page_address(p);
                /* zero-pad. init_mount() will make sure it's terminated */
                strncpy(data_page, data, PAGE_SIZE);
        }


        ret = init_mount(name, "/root", fs, flags, data_page);
        if (ret)
                goto out;

        /* The superblock is read from the new working directory. */
        init_chdir("/root");
        s = current->fs->pwd.dentry->d_sb;
        ROOT_DEV = s->s_dev;
        printk(KERN_INFO
               "VFS: Mounted root (%s filesystem)%s on device %u:%u.\n",
               s->s_type->name,
               sb_rdonly(s) ? " readonly" : "",
               MAJOR(ROOT_DEV), MINOR(ROOT_DEV));

out:
        /* The data page is released on both the success and failure paths. */
        if (p)
                put_page(p);
        return ret;
}

补充

// fs/namespace.c
/*
 * Mount-layer initialisation: create the "mnt_cache" slab cache and the
 * two mount hash tables, initialise kernfs and sysfs, create the "fs"
 * kobject, then run shmem_init(), init_rootfs() and init_mount_tree().
 *
 * A failed hash-table allocation panics; sysfs_init() and kobject
 * failures are only logged as warnings.
 */
void __init mnt_init(void)
{
        int err;

        mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

        mount_hashtable = alloc_large_system_hash("Mount-cache",
                                sizeof(struct hlist_head),
                                mhash_entries, 19,
                                HASH_ZERO,
                                &m_hash_shift, &m_hash_mask, 0, 0);
        mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
                                sizeof(struct hlist_head),
                                mphash_entries, 19,
                                HASH_ZERO,
                                &mp_hash_shift, &mp_hash_mask, 0, 0);

        if (!mount_hashtable || !mountpoint_hashtable)
                panic("Failed to allocate mount hash table\n");

        kernfs_init();

        err = sysfs_init();
        if (err)
                printk(KERN_WARNING "%s: sysfs_init error: %d\n",
                        __func__, err);
        fs_kobj = kobject_create_and_add("fs", NULL);
        if (!fs_kobj)
                printk(KERN_WARNING "%s: kobj create error\n", __func__);
        shmem_init();
        /* rootfs setup comes last: choose its backing, then mount it. */
        init_rootfs();
        init_mount_tree();
}

// init/do_mounts.c
/*
 * Set the is_tmpfs flag when CONFIG_TMPFS is enabled and either
 *  - saved_root_name is empty and root_fs_names is NULL, or
 *  - root_fs_names contains the substring "tmpfs".
 * In every other case is_tmpfs is left untouched.
 */
void __init init_rootfs(void)
{
        if (IS_ENABLED(CONFIG_TMPFS)) {
                if (!saved_root_name[0] && !root_fs_names)
                {
                        is_tmpfs = true;
                }
                else if (root_fs_names && !!strstr(root_fs_names, "tmpfs"))
                {
                        is_tmpfs = true;
                }
        }
}
// fs/namespace.c
/*
 * Create a mount of filesystem @type from inside the kernel.
 *
 * @type:  filesystem type; NULL yields ERR_PTR(-EINVAL)
 * @flags: flags handed to fs_context_for_mount()
 * @name:  optional source name, parsed as the "source" parameter
 * @data:  mount data handed to parse_monolithic_mount_data()
 *
 * Returns the result of fc_mount() on success, or an ERR_PTR carrying the
 * first error hit while building the context. The fs_context is released
 * before returning in both cases.
 */
struct vfsmount *vfs_kern_mount(struct file_system_type *type,
                                int flags, const char *name,
                                void *data)
{
        struct fs_context *fc;
        struct vfsmount *mnt;
        int ret = 0;

        if (!type)
                return ERR_PTR(-EINVAL);

        fc = fs_context_for_mount(type, flags);
        if (IS_ERR(fc))
                return ERR_CAST(fc);

        /* Each step runs only if every previous step succeeded (ret == 0). */
        if (name)
                ret = vfs_parse_fs_string(fc, "source",
                                          name, strlen(name));
        if (!ret)
                ret = parse_monolithic_mount_data(fc, data);
        if (!ret)
                mnt = fc_mount(fc);
        else
                mnt = ERR_PTR(ret);

        put_fs_context(fc);
        return mnt;
}

/*
 * Mount rootfs and install it as the root of the initial mount namespace.
 *
 * Steps: mount rootfs_fs_type under the name "rootfs", allocate a mount
 * namespace for init_user_ns, attach the rootfs mount to it as its root
 * and only mount, assign the namespace to init_task, and finally point
 * both the working directory and the root of the current task at the
 * rootfs root dentry. Either allocation failing results in panic().
 */
static void __init init_mount_tree(void)
{
        struct vfsmount *mnt;
        struct mount *m;
        struct mnt_namespace *ns;
        struct path root;

        mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
        if (IS_ERR(mnt))
                panic("Can't create rootfs");

        ns = alloc_mnt_ns(&init_user_ns, false);
        if (IS_ERR(ns))
                panic("Can't allocate initial namespace");
        m = real_mount(mnt);
        m->mnt_ns = ns;
        ns->root = m;
        ns->mounts = 1;
        list_add(&m->mnt_list, &ns->list);
        init_task.nsproxy->mnt_ns = ns;
        get_mnt_ns(ns);

        root.mnt = mnt;
        root.dentry = mnt->mnt_root;
        mnt->mnt_flags |= MNT_LOCKED;

        /* Both pwd and root of the current task now refer to rootfs "/". */
        set_fs_pwd(current->fs, &root);
        set_fs_root(current->fs, &root);
}
/*
 * Abridged excerpt: only the call relevant to this article is shown;
 * the "// ..." markers stand for omitted code. Calls mnt_init().
 */
void __init vfs_caches_init(void)
{
        // ...
        mnt_init();
        // ...
}
/*
 * Abridged excerpt: the "// ..." markers stand for omitted code. Shown
 * here: vfs_caches_init() runs before arch_call_rest_init().
 */
void start_kernel(void)
{
        // ...
        vfs_caches_init();
        // ...
        arch_call_rest_init();
}


/*
 * Abridged excerpt: the "// ..." markers stand for omitted code,
 * including the declaration of pid. Shown here: kernel_init is started
 * through user_mode_thread() with CLONE_FS.
 */
noinline void __ref __noreturn rest_init(void)
{

        // ...
        pid = user_mode_thread(kernel_init, NULL, CLONE_FS);
        // ...
}

/* Weak default implementation: simply calls rest_init(). */
void __init __weak __noreturn arch_call_rest_init(void)
{
        rest_init();
}
/*
 * Abridged excerpt: the "// ..." markers stand for omitted code.
 *
 * Shown here: after wait_for_initramfs() and console_on_rootfs(), the
 * ramdisk_execute_command path is checked with init_eaccess(). If that
 * check fails, ramdisk_execute_command is cleared and
 * prepare_namespace() is called.
 */
static noinline void __init kernel_init_freeable(void)
{

        // ...

        wait_for_initramfs();
        console_on_rootfs();

        /*
         * check if there is an early userspace init.  If yes, let it do all
         * the work
         */
        if (init_eaccess(ramdisk_execute_command) != 0) {
                ramdisk_execute_command = NULL;
                prepare_namespace();
        }
        // ...
}

/*
 * Thread function that finishes boot and then runs the first init program.
 *
 * After kernel_init_freeable() and the release of init memory, candidates
 * are tried in this order:
 *   1. ramdisk_execute_command (a failure is logged, then the next is tried)
 *   2. execute_command         (a failure panics)
 *   3. CONFIG_DEFAULT_INIT, when non-empty (a failure is logged)
 *   4. "/sbin/init", "/etc/init", "/bin/init", "/bin/sh"
 * Returns 0 as soon as one candidate starts; panics if none does.
 *
 * @unused: not used.
 */
static int __ref kernel_init(void *unused)
{
        int ret;

        /*
         * Wait until kthreadd is all set-up.
         */
        wait_for_completion(&kthreadd_done);

        kernel_init_freeable();
        /* need to finish all async __init code before freeing the memory */
        async_synchronize_full();

        system_state = SYSTEM_FREEING_INITMEM;
        kprobe_free_init_mem();
        ftrace_free_init_mem();
        kgdb_free_init_mem();
        exit_boot_config();
        free_initmem();
        mark_readonly();

        /*
         * Kernel mappings are now finalized - update the userspace page-table
         * to finalize PTI.
         */
        pti_finalize();

        system_state = SYSTEM_RUNNING;
        numa_default_policy();

        rcu_end_inkernel_boot();

        do_sysctl_args();

        /* Still set only if kernel_init_freeable() did not clear it. */
        if (ramdisk_execute_command) {
                ret = run_init_process(ramdisk_execute_command);
                if (!ret)
                        return 0;
                pr_err("Failed to execute %s (error %d)\n",
                       ramdisk_execute_command, ret);
        }

        /*
         * We try each of these until one succeeds.
         *
         * The Bourne shell can be used instead of init if we are
         * trying to recover a really broken machine.
         */
        if (execute_command) {
                ret = run_init_process(execute_command);
                if (!ret)
                        return 0;
                panic("Requested init %s failed (error %d).",
                      execute_command, ret);
        }

        if (CONFIG_DEFAULT_INIT[0] != '\0') {
                ret = run_init_process(CONFIG_DEFAULT_INIT);
                if (ret)
                        pr_err("Default init %s failed (error %d)\n",
                               CONFIG_DEFAULT_INIT, ret);
                else
                        return 0;
        }

        /* Built-in fallbacks; the || chain stops at the first success. */
        if (!try_to_run_init_process("/sbin/init") ||
            !try_to_run_init_process("/etc/init") ||
            !try_to_run_init_process("/bin/init") ||
            !try_to_run_init_process("/bin/sh"))
                return 0;

        panic("No working init found.  Try passing init= option to kernel. "
              "See Linux Documentation/admin-guide/init.rst for guidance.");
}
start_kernel
        -> vfs_caches_init
                -> mnt_init
                        -> init_rootfs
                        -> init_mount_tree
                                -> vfs_kern_mount 【初始化rootfs:mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);】
                                -> 在内核挂载了rootfs后,会将当前进程的pwd和root目录设置为rootfs的根目录。
        -> arch_call_rest_init -> rest_init -> user_mode_thread:kernel_init
                -> prepare_namespace
                        -> mount_root
                                -> mount_block_root 【创建/dev/root】
                                        -> mount_root_generic 【将/dev/root挂载到/root】
                                        -> do_mount_root 【根据内核所有fs的名字试着挂载/dev/root到/root】
                                                -> init_chdir 【使用init_chdir将进程的当前目录设置为/root】
                        -> devtmpfs_mount
                        -> init_mount
                        -> init_chroot