ICode9

精准搜索请尝试: 精确搜索
首页 > 系统相关> 文章详细

关于linux的一点好奇心(二):linux启动过程之三大进程

2022-01-23 22:01:55  阅读:250  来源: 互联网

标签:之三 boot idle 好奇心 init void linux create cpu


  上一节我们通过对x86的linux内核的讲解,知道了它的一个大概的启动过程。

        /arch/x86/boot/header.S
        -> calll main    ->    /arch/x86/boot/main.c
        -> go_to_protected_mode()    ->    /arch/x86/boot/pm.c
        -> protected_mode_jump()    ->    /arch/x86/boot/pmjump.S
        -> jmpl    *%eax    ->    /arch/x86/kernel/head_32.S
        -> .long i386_start_kernel    ->    /arch/x86/kernel/head32.c
        -> start_kernel()    ->    /init/main.c    (C语言入口)

  这其中的动作,基本都是找到对应的地址,然后设置各种设备的初始化信息,中断设置,键盘,控制台,idt...

  当然,有相当一部分代码是用汇编语言完成的,这自然是底层硬件决定的,而且因为特殊性,再封装是没有必要的了。所以,汇编是最好的选择。

  本篇,我们再来看看cpu架构无关的main都又干了啥,从而解开心中的谜团。

 

1. start_kernel入口

  排除掉架构相关的代码,就是到了/init/main.c 中的 start_kernel(), 从这里我们可以看到操作系统启动时,大致干了啥。

// /init/main.c
/*
 * start_kernel - architecture-independent C entry point of the kernel.
 *
 * Reached from the arch-specific startup code (e.g. i386_start_kernel on
 * x86). Runs on the boot CPU with interrupts disabled and brings up every
 * core subsystem in a strict, order-dependent sequence: memory management,
 * the scheduler, interrupts/timers, the console, VFS caches, etc.
 * Never returns; it ends by calling rest_init(), which spawns the first
 * processes and turns the boot thread into the idle task.
 */
asmlinkage __visible void __init start_kernel(void)
{
    char *command_line;
    char *after_dashes;

    /* Early debugging hooks and boot-CPU identification. */
    set_task_stack_end_magic(&init_task);
    smp_setup_processor_id();
    debug_objects_early_init();

    cgroup_init_early();

    local_irq_disable();
    early_boot_irqs_disabled = true;

    /*
     * Interrupts are still disabled. Do necessary setups, then
     * enable them.
     */
    boot_cpu_init();
    page_address_init();
    pr_notice("%s", linux_banner);
    /* Arch-specific setup: memory map, boot params, command line. */
    setup_arch(&command_line);
    /*
     * Set up the initial canary and entropy after arch
     * and after adding latent and command line entropy.
     */
    add_latent_entropy();
    add_device_randomness(command_line, strlen(command_line));
    boot_init_stack_canary();
    mm_init_cpumask(&init_mm);
    setup_command_line(command_line);
    setup_nr_cpu_ids();
    setup_per_cpu_areas();
    smp_prepare_boot_cpu();    /* arch-specific boot-cpu hooks */
    boot_cpu_hotplug_init();

    /* Page allocator zone lists for the boot node. */
    build_all_zonelists(NULL);
    page_alloc_init();

    /* Parse the kernel command line (early params, then the rest). */
    pr_notice("Kernel command line: %s\n", boot_command_line);
    parse_early_param();
    after_dashes = parse_args("Booting kernel",
                  static_command_line, __start___param,
                  __stop___param - __start___param,
                  -1, -1, NULL, &unknown_bootoption);
    if (!IS_ERR_OR_NULL(after_dashes))
        parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
               NULL, set_init_arg);

    jump_label_init();

    /*
     * These use large bootmem allocations and must precede
     * kmem_cache_init()
     */
    setup_log_buf(0);
    vfs_caches_init_early();
    sort_main_extable();
    trap_init();
    /* Core memory management (slab allocator etc.) comes online here. */
    mm_init();

    ftrace_init();

    /* trace_printk can be enabled here */
    early_trace_init();

    /*
     * Set up the scheduler prior starting any interrupts (such as the
     * timer interrupt). Full topology setup happens at smp_init()
     * time - but meanwhile we still have a functioning scheduler.
     */
    sched_init();
    /*
     * Disable preemption - early bootup scheduling is extremely
     * fragile until we cpu_idle() for the first time.
     */
    preempt_disable();
    if (WARN(!irqs_disabled(),
         "Interrupts were enabled *very* early, fixing it\n"))
        local_irq_disable();
    radix_tree_init();

    /*
     * Set up housekeeping before setting up workqueues to allow the unbound
     * workqueue to take non-housekeeping into account.
     */
    housekeeping_init();

    /*
     * Allow workqueue creation and work item queueing/cancelling
     * early.  Work item execution depends on kthreads and starts after
     * workqueue_init().
     */
    workqueue_init_early();

    rcu_init();

    /* Trace events are available after this */
    trace_init();

    if (initcall_debug)
        initcall_debug_enable();

    context_tracking_init();
    /* init some links before init_ISA_irqs() */
    early_irq_init();
    init_IRQ();
    tick_init();
    rcu_init_nohz();
    init_timers();
    hrtimers_init();
    softirq_init();
    timekeeping_init();
    time_init();
    sched_clock_postinit();
    printk_safe_init();
    perf_event_init();
    profile_init();
    call_function_init();
    WARN(!irqs_disabled(), "Interrupts were enabled early\n");
    /* IRQ infrastructure is ready: enable interrupts for the first time. */
    early_boot_irqs_disabled = false;
    local_irq_enable();

    kmem_cache_init_late();

    /*
     * HACK ALERT! This is early. We're enabling the console before
     * we've done PCI setups etc, and console_init() must be aware of
     * this. But we do want output early, in case something goes wrong.
     */
    console_init();
    if (panic_later)
        panic("Too many boot %s vars at `%s'", panic_later,
              panic_param);

    lockdep_info();

    /*
     * Need to run this when irqs are enabled, because it wants
     * to self-test [hard/soft]-irqs on/off lock inversion bugs
     * too:
     */
    locking_selftest();

    /*
     * This needs to be called before any devices perform DMA
     * operations that might use the SWIOTLB bounce buffers. It will
     * mark the bounce buffers as decrypted so that their usage will
     * not cause "plain-text" data to be decrypted when accessed.
     */
    mem_encrypt_init();

#ifdef CONFIG_BLK_DEV_INITRD
    /* Sanity check: disable the initrd if it sits below usable memory. */
    if (initrd_start && !initrd_below_start_ok &&
        page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
        pr_crit("initrd overwritten (0x%08lx < 0x%08lx) - disabling it.\n",
            page_to_pfn(virt_to_page((void *)initrd_start)),
            min_low_pfn);
        initrd_start = 0;
    }
#endif
    page_ext_init();
    kmemleak_init();
    debug_objects_mem_init();
    setup_per_cpu_pageset();
    numa_policy_init();
    acpi_early_init();
    if (late_time_init)
        late_time_init();
    calibrate_delay();
    pid_idr_init();
    anon_vma_init();
#ifdef CONFIG_X86
    if (efi_enabled(EFI_RUNTIME_SERVICES))
        efi_enter_virtual_mode();
#endif
    thread_stack_cache_init();
    cred_init();
    fork_init();
    proc_caches_init();
    uts_ns_init();
    buffer_init();
    key_init();
    security_init();
    dbg_late_init();
    vfs_caches_init();
    pagecache_init();
    signals_init();
    seq_file_init();
    proc_root_init();
    nsfs_init();
    cpuset_init();
    cgroup_init();
    taskstats_init_early();
    delayacct_init();

    check_bugs();

    acpi_subsystem_init();
    arch_post_acpi_subsys_init();
    sfi_init_late();

    if (efi_enabled(EFI_RUNTIME_SERVICES)) {
        efi_free_boot_services();
    }

    /* Spawn the first kernel threads (init, kthreadd) and become idle. */
    /* Do the rest non-__init'ed, we're now alive */
    rest_init();
}

/*
 * We need to finalize in a non-__init function or else race conditions
 * between the root thread and the init thread may cause start_kernel to
 * be reaped by free_initmem before the root thread has proceeded to
 * cpu_idle.
 *
 * gcc-3.4 accidentally inlines this function, so use noinline.
 */

/* Completion used to make PID 1 wait until kthreadd is fully set up. */
static __initdata DECLARE_COMPLETION(kthreadd_done);

/*
 * rest_init - final step of start_kernel().
 *
 * Creates the three foundational tasks of a running system:
 *   1. the init process (PID 1, entry point kernel_init),
 *   2. the kthreadd process (PID 2, parent of all kernel threads),
 *   3. turns the current boot thread into the idle task (PID 0).
 * Never returns.
 */
static noinline void __ref rest_init(void)
{
    struct task_struct *tsk;
    int pid;

    rcu_scheduler_starting();
    /*
     * We need to spawn init first so that it obtains pid 1, however
     * the init task will end up wanting to create kthreads, which, if
     * we schedule it before we create kthreadd, will OOPS.
     */
    /* Create the init process first; it gets PID 1. */
    pid = kernel_thread(kernel_init, NULL, CLONE_FS);
    /*
     * Pin init on the boot CPU. Task migration is not properly working
     * until sched_init_smp() has been run. It will set the allowed
     * CPUs for init to the non isolated CPUs.
     */
    rcu_read_lock();
    tsk = find_task_by_pid_ns(pid, &init_pid_ns);
    set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
    rcu_read_unlock();

    numa_default_policy();
    /* Then create the kthreadd process; it gets PID 2. */
    pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
    rcu_read_lock();
    kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
    rcu_read_unlock();

    /*
     * Enable might_sleep() and smp_processor_id() checks.
     * They cannot be enabled earlier because with CONFIG_PREEMPT=y
     * kernel_thread() would trigger might_sleep() splats. With
     * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
     * already, but it's stuck on the kthreadd_done completion.
     */
    system_state = SYSTEM_SCHEDULING;

    /* Release kernel_init(), which is blocked waiting for kthreadd. */
    complete(&kthreadd_done);

    /*
     * The boot idle thread must execute schedule()
     * at least once to get things moving:
     */
    schedule_preempt_disabled();
    /* Call into cpu_idle with preempt disabled */
    /* This boot thread now becomes the idle task. */
    cpu_startup_entry(CPUHP_ONLINE);
}

  同样,有大量的设备的init操作。但 rest_init() 稍微不太一样点,至少它和硬件关系不那么大了。它主要干三大件事:1. 创建init进程; 2. 创建kthreadd进程; 3. 将当前启动线程转化为idle进程. 这三个东西,也许更值得多探探究竟。因为毕竟,对于硬件我们还是外行。

 

2. init进程的初始化过程

  init进程,又叫第一个进程,即pid为1的进程,是系统必不可少的进程。那它都干了啥呢?我们来看一下:

// init/main.c
/*
 * kernel_init - entry point of the init process (PID 1).
 *
 * Finishes kernel-side initialization via kernel_init_freeable(),
 * frees __init memory, then execs the first userspace program
 * (ramdisk /init, the init= command-line choice, or one of the
 * standard fallbacks). Panics if no init can be started.
 */
static int __ref kernel_init(void *unused)
{
    int ret;
    /* Initialize remaining subsystems and prepare userspace. */
    kernel_init_freeable();
    /* need to finish all async __init code before freeing the memory */
    async_synchronize_full();
    ftrace_free_init_mem();
    jump_label_invalidate_initmem();
    free_initmem();
    mark_readonly();
    system_state = SYSTEM_RUNNING;
    numa_default_policy();

    rcu_end_inkernel_boot();

    /* Prefer the init binary supplied by the initramfs, if any. */
    if (ramdisk_execute_command) {
        ret = run_init_process(ramdisk_execute_command);
        if (!ret)
            return 0;
        pr_err("Failed to execute %s (error %d)\n",
               ramdisk_execute_command, ret);
    }

    /*
     * We try each of these until one succeeds.
     *
     * The Bourne shell can be used instead of init if we are
     * trying to recover a really broken machine.
     */
    if (execute_command) {
        ret = run_init_process(execute_command);
        if (!ret)
            return 0;
        panic("Requested init %s failed (error %d).",
              execute_command, ret);
    }
    /* Fall back to the conventional init locations; paths differ across
     * distros/configs, so try each in turn — any success returns 0. */
    if (!try_to_run_init_process("/sbin/init") ||
        !try_to_run_init_process("/etc/init") ||
        !try_to_run_init_process("/bin/init") ||
        !try_to_run_init_process("/bin/sh"))
        return 0;

    panic("No working init found.  Try passing init= option to kernel. "
          "See Linux Documentation/admin-guide/init.rst for guidance.");
}

// /init/main.c
/*
 * kernel_init_freeable - the __init portion of PID 1's work.
 *
 * Waits for kthreadd, brings up the secondary CPUs (smp_init),
 * runs all registered initcalls (do_basic_setup), opens the initial
 * console, and locates the first userspace init program. Everything
 * here lives in __init memory, which is freed afterwards.
 */
static noinline void __init kernel_init_freeable(void)
{
    /*
     * Wait until kthreadd is all set-up.
     */
    wait_for_completion(&kthreadd_done);

    /* Now the scheduler is fully set up and can do blocking allocations */
    gfp_allowed_mask = __GFP_BITS_MASK;

    /*
     * init can allocate pages on any node
     */
    set_mems_allowed(node_states[N_MEMORY]);

    cad_pid = task_pid(current);

    smp_prepare_cpus(setup_max_cpus);
    /* Bind workqueues to each CPU so they can run their own work items. */
    workqueue_init();

    init_mm_internals();

    do_pre_smp_initcalls();
    lockup_detector_init();

    /* Bring the remaining CPUs online. */
    smp_init();
    sched_init_smp();

    page_alloc_init_late();
    /* CPUs are ready: run the real initcall sequence now. */
    do_basic_setup();

    /* Open the /dev/console on the rootfs, this should never fail */
    if (ksys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
        pr_err("Warning: unable to open an initial console.\n");

    /* Duplicate fd 0 twice to create stdout (1) and stderr (2). */
    (void) ksys_dup(0);
    (void) ksys_dup(0);
    /*
     * check if there is an early userspace init.  If yes, let it do all
     * the work
     */

    if (!ramdisk_execute_command)
        ramdisk_execute_command = "/init";

    /* No /init in the initramfs: mount the real root filesystem. */
    if (ksys_access((const char __user *)
            ramdisk_execute_command, 0) != 0) {
        ramdisk_execute_command = NULL;
        prepare_namespace();
    }

    /*
     * Ok, we have completed the initial bootup, and
     * we're essentially up and running. Get rid of the
     * initmem segments and start the user-mode stuff..
     *
     * rootfs is available now, try loading the public keys
     * and default modules
     */

    integrity_load_keys();
    /* Load the default kernel modules. */
    load_default_modules();
}

/*
 * Ok, the machine is now initialized. None of the devices
 * have been touched yet, but the CPU subsystem is up and
 * running, and memory and process management works.
 *
 * Now we can finally start doing some real work..
 */
/* Runs the driver model init and every registered initcall level. */
static void __init do_basic_setup(void)
{
    cpuset_init_smp();
    shmem_init();
    driver_init();          /* device/driver model core */
    init_irq_proc();
    do_ctors();             /* link-time constructors */
    usermodehelper_enable();
    do_initcalls();         /* all initcall levels, in order */
}

// /drivers/base/init.c  driver model initialization
/**
 * driver_init - initialize driver model.
 *
 * Call the driver model init functions to initialize their
 * subsystems. Called early from init/main.c.
 */
void __init driver_init(void)
{
    /* These are the core pieces */
    devtmpfs_init();
    devices_init();
    buses_init();
    classes_init();
    firmware_init();
    hypervisor_init();

    /* These are also core pieces, but must come after the
     * core core pieces.
     */
    platform_bus_init();
    cpu_dev_init();
    memory_dev_init();
    container_dev_init();
    of_core_init();
}


// /init/main.c
/*
 * This function requests modules which should be loaded by default and is
 * called twice right after initrd is mounted and right before init is
 * exec'd.  If such modules are on either initrd or rootfs, they will be
 * loaded before control is passed to userland.
 */
void __init load_default_modules(void)
{
    /* Currently the only default module is the boot-chosen I/O elevator. */
    load_default_elevator_module();
}
// /block/elevator.c
/* called during boot to load the elevator chosen by the elevator param */
void __init load_default_elevator_module(void)
{
    struct elevator_type *e;

    /* No elevator= boot parameter given: nothing to load. */
    if (!chosen_elevator[0])
        return;

    /*
     * Boot parameter is deprecated, we haven't supported that for MQ.
     * Only look for non-mq schedulers from here.
     */
    spin_lock(&elv_list_lock);
    e = elevator_find(chosen_elevator, false);
    spin_unlock(&elv_list_lock);

    /* Not built in: ask the module loader for "<name>-iosched". */
    if (!e)
        request_module("%s-iosched", chosen_elevator);
}

  可以看到,init进程承担着非常重要的工作,它需要初始化内存,页,队列,cpu等等,还要创建用户空间,加载默认模块等等。并且更重要的是,它要负责执行开机启动程序,而这决定了我们的系统如何运行。它如此重要以至于,它作为第一个进程被创建出来。是一个不可少的进程。

 

3. kthreadd内核进程运行流程

  继init进程之后,kthreadd是第二个运行的进程,它又是在干什么呢?实际上,它主要负责为内核的其他部分按需创建内核线程。

// declared in /include/linux/kthread.h, defined in /kernel/kthread.c
/*
 * kthreadd - entry point of PID 2, the parent of all kernel threads.
 *
 * Loops forever: sleeps while kthread_create_list is empty, and
 * otherwise drains the list, spawning one kernel thread per queued
 * kthread_create_info request.
 */
int kthreadd(void *unused)
{
    struct task_struct *tsk = current;

    /* Setup a clean context for our children to inherit. */
    /* Keep kthreadd itself as unconstrained as possible (all CPUs, all
     * memory nodes, signals ignored) so children start from a clean slate. */
    set_task_comm(tsk, "kthreadd");
    ignore_signals(tsk);
    set_cpus_allowed_ptr(tsk, cpu_all_mask);
    set_mems_allowed(node_states[N_MEMORY]);

    current->flags |= PF_NOFREEZE;
    cgroup_init_kthreadd();

    for (;;) {
        set_current_state(TASK_INTERRUPTIBLE);
        if (list_empty(&kthread_create_list))
            /* Nothing queued: yield the CPU until woken by a request. */
            schedule();
        __set_current_state(TASK_RUNNING);

        spin_lock(&kthread_create_lock);
        while (!list_empty(&kthread_create_list)) {
            struct kthread_create_info *create;

            create = list_entry(kthread_create_list.next,
                        struct kthread_create_info, list);
            list_del_init(&create->list);
            /* Drop the lock while forking; re-take it for the next item. */
            spin_unlock(&kthread_create_lock);
            /* Create one kernel thread for this request. */
            create_kthread(create);

            spin_lock(&kthread_create_lock);
        }
        spin_unlock(&kthread_create_lock);
    }

    return 0;
}

// /kernel/kthread.c   create one kernel thread from a queued request
/*
 * create_kthread - fork a kernel thread for a kthread_create_info request.
 *
 * On fork failure, reports the error back through create->done, or frees
 * the request if the waiting creator was already SIGKILLed.
 */
static void create_kthread(struct kthread_create_info *create)
{
    int pid;

#ifdef CONFIG_NUMA
    /* Honor the NUMA node preference requested by the creator. */
    current->pref_node_fork = create->node;
#endif
    /* We want our own signal handler (we take no signals by default). */
    pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
    if (pid < 0) {
        /* If user was SIGKILLed, I release the structure. */
        struct completion *done = xchg(&create->done, NULL);

        if (!done) {
            kfree(create);
            return;
        }
        /* Hand the error code back to the waiting creator. */
        create->result = ERR_PTR(pid);
        complete(done);
    }
}

  可见 kthreadd 的作用就是不停地根据需要,创建一个个的内核进程线程咯。

 

4. idle进程

  idle进程是启动流程的最后一步:启动线程自身转化为idle进程。它的作用是在没有其他任务可运行时占用cpu,并视情况让cpu进入低功耗的休眠状态,直到有任务需要调度。

// kernel/sched/idle.c
/*
 * cpu_startup_entry - turn the calling thread into this CPU's idle task.
 *
 * Called by the boot CPU from rest_init() and by secondary CPUs when
 * they come online. Never returns: ends in an endless do_idle() loop.
 */
void cpu_startup_entry(enum cpuhp_state state)
{
    /*
     * This #ifdef needs to die, but it's too late in the cycle to
     * make this generic (ARM and SH have never invoked the canary
     * init for the non boot CPUs!). Will be fixed in 3.11
     */
#ifdef CONFIG_X86
    /*
     * If we're the non-boot CPU, nothing set the stack canary up
     * for us. The boot CPU already has it initialized but no harm
     * in doing it again. This is a good place for updating it, as
     * we wont ever return from this function (so the invalid
     * canaries already on the stack wont ever trigger).
     */
    boot_init_stack_canary();
#endif
    arch_cpu_idle_prepare();
    cpuhp_online_idle(state);
    /* Idle forever: one do_idle() pass per wakeup/reschedule cycle. */
    while (1)
        do_idle();
}

/*
 * Generic idle loop implementation
 *
 * Called with polling cleared.
 *
 * One iteration of the idle task: repeatedly enter a low-power idle
 * state (or poll) until need_resched() is set, then hand the CPU back
 * to the scheduler via schedule_idle().
 */
static void do_idle(void)
{
    int cpu = smp_processor_id();
    /*
     * If the arch has a polling bit, we maintain an invariant:
     *
     * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
     * rq->idle). This means that, if rq->idle has the polling bit set,
     * then setting need_resched is guaranteed to cause the CPU to
     * reschedule.
     */

    __current_set_polling();
    tick_nohz_idle_enter();

    /* Stay idle until some task needs this CPU. */
    while (!need_resched()) {
        check_pgt_cache();
        rmb();

        /* A CPU being taken offline dies here, inside its idle task. */
        if (cpu_is_offline(cpu)) {
            tick_nohz_idle_stop_tick_protected();
            cpuhp_report_idle_dead();
            arch_cpu_idle_dead();
        }

        local_irq_disable();
        arch_cpu_idle_enter();

        /*
         * In poll mode we reenable interrupts and spin. Also if we
         * detected in the wakeup from idle path that the tick
         * broadcast device expired for us, we don't want to go deep
         * idle as we know that the IPI is going to arrive right away.
         */
        if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
            tick_nohz_idle_restart_tick();
            /* Busy-poll instead of entering a deep idle state. */
            cpu_idle_poll();
        } else {
            /* Let cpuidle pick and enter a low-power state. */
            cpuidle_idle_call();
        }
        arch_cpu_idle_exit();
    }

    /*
     * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
     * be set, propagate it into PREEMPT_NEED_RESCHED.
     *
     * This is required because for polling idle loops we will not have had
     * an IPI to fold the state for us.
     */
    preempt_set_need_resched();
    tick_nohz_idle_exit();
    __current_clr_polling();

    /*
     * We promise to call sched_ttwu_pending() and reschedule if
     * need_resched() is set while polling is set. That means that clearing
     * polling needs to be visible before doing these things.
     */
    smp_mb__after_atomic();

    sched_ttwu_pending();
    /* Give the CPU to the newly-runnable task. */
    schedule_idle();

    if (unlikely(klp_patch_pending(current)))
        klp_update_patch_state(current);
}

/*
 * cpu_idle_poll - busy-wait idle: spin with interrupts enabled instead
 * of entering a low-power state, until a reschedule is needed or the
 * polling condition clears. Used when forced polling is requested or a
 * broadcast tick wakeup is imminent.
 */
static noinline int __cpuidle cpu_idle_poll(void)
{
    rcu_idle_enter();
    trace_cpu_idle_rcuidle(0, smp_processor_id());
    local_irq_enable();
    stop_critical_timings();

    /* Spin; cpu_relax() hints to the CPU that this is a busy-wait loop. */
    while (!tif_need_resched() &&
        (cpu_idle_force_poll || tick_check_broadcast_expired()))
        cpu_relax();
    start_critical_timings();
    trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
    rcu_idle_exit();

    return 1;
}

// arch/sh/include/asm/processor.h
/* On this arch cpu_relax() is just a compiler barrier (other arches may
 * emit a real pause/yield instruction). */
#define cpu_relax()    barrier()

// arch/powerpc/boot/io.h
/* Empty asm with a "memory" clobber: forbids the compiler from caching
 * memory values across this point, without emitting any instruction. */
static inline void barrier(void)
{
    asm volatile("" : : : "memory");
}

  idle 进程就是不停地运行检测,然后调用cpu命令进行休眠。

  当然了,在有的精简系统中,idle进程并非是必须的,但其思想却是值得一学的。

标签:之三,boot,idle,好奇心,init,void,linux,create,cpu
来源: https://www.cnblogs.com/yougewe/p/15837529.html

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有