struct sched_entity {
    /* For load-balancing: */
    struct load_weight load;
    unsigned long runnable_weight;
    struct rb_node run_node;
    struct list_head group_node;
    unsigned int on_rq;

    u64 exec_start;
    u64 sum_exec_runtime;
    u64 vruntime;
    u64 prev_sum_exec_runtime;

    u64 nr_migrations;

    struct sched_statistics statistics;

#ifdef CONFIG_FAIR_GROUP_SCHED
    int depth;
    struct sched_entity *parent;
    /* rq on which this entity is (to be) queued: */
    struct cfs_rq *cfs_rq;
    /* rq "owned" by this entity/group: */
    struct cfs_rq *my_q;
#endif

#ifdef CONFIG_SMP
    /*
     * Per entity load average tracking.
     *
     * Put into separate cache line so it does not
     * collide with read-mostly values above.
     */
    struct sched_avg avg;
#endif
};
At scheduling time, the scheduling entities are first separated into real-time tasks and normal tasks. Normal tasks are then organized in a red-black tree ordered by virtual runtime: the entity with the smallest vruntime sits leftmost in the tree, the one with the largest vruntime rightmost. Under the CFS policy, the scheduler simply picks the leftmost node of this tree as the next task to get the CPU. This tree lives inside the run queue, struct rq, the per-CPU structure that holds the CFS, real-time and deadline sub-queues.
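As a minimal, self-contained sketch of that ordering rule (the struct and function names below are made up for illustration; only the comparison trick mirrors entity_before() in kernel/sched/fair.c): entities are compared by vruntime using a signed difference, so the ordering stays correct even when the unsigned counters wrap around.

#include <stdint.h>
#include <stdbool.h>

/* Stand-in for a scheduling entity -- illustration only. */
struct demo_entity {
    uint64_t vruntime;
};

/* True if a should run before b. The signed subtraction keeps the
 * comparison valid across u64 wrap-around, which is the same trick
 * CFS uses when placing entities into the red-black tree. */
static bool demo_entity_before(const struct demo_entity *a,
                               const struct demo_entity *b)
{
    return (int64_t)(a->vruntime - b->vruntime) < 0;
}

With that ordering rule in mind, here is the per-CPU run queue, struct rq: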
/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
    /* runqueue lock: */
    raw_spinlock_t lock;

    /*
     * nr_running and cpu_load should be in the same cacheline because
     * remote CPUs use both these fields when doing load calculation.
     */
    unsigned int nr_running;
    ......
#define CPU_LOAD_IDX_MAX 5
    unsigned long cpu_load[CPU_LOAD_IDX_MAX];
    ......
    /* capture load from *all* tasks on this CPU: */
    struct load_weight load;
    unsigned long nr_load_updates;
    u64 nr_switches;

    struct cfs_rq cfs;
    struct rt_rq rt;
    struct dl_rq dl;
    ......
    /*
     * This is part of a global counter where only the total sum
     * over all CPUs matters. A task can increase this counter on
     * one CPU and if it got migrated afterwards it may decrease
     * it on another CPU. Always updated under the runqueue lock:
     */
    unsigned long nr_uninterruptible;

    struct task_struct *curr;
    struct task_struct *idle;
    struct task_struct *stop;
    unsigned long next_balance;
    struct mm_struct *prev_mm;

    unsigned int clock_update_flags;
    u64 clock;
    /* Ensure that all clocks are in the same cache line */
    u64 clock_task ____cacheline_aligned;
    u64 clock_pelt;
    unsigned long lost_idle_time;

    atomic_t nr_iowait;
    ......
    /* calc_load related fields */
    unsigned long calc_load_update;
    long calc_load_active;
    ......
};
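struct rq is a per-CPU variable; code such as __schedule() and scheduler_tick() reaches it through small per-CPU accessors. They look roughly like this (quoted from memory of kernel/sched/sched.h, so treat the exact form as approximate):

DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#define cpu_rq(cpu)     (&per_cpu(runqueues, (cpu)))
#define this_rq()       this_cpu_ptr(&runqueues)
#define task_rq(p)      cpu_rq(task_cpu(p))
#define cpu_curr(cpu)   (cpu_rq(cpu)->curr)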
/* CFS-related fields in a runqueue */
struct cfs_rq {
    struct load_weight load;
    unsigned long runnable_weight;
    unsigned int nr_running;
    unsigned int h_nr_running;

    u64 exec_clock;
    u64 min_vruntime;
#ifndef CONFIG_64BIT
    u64 min_vruntime_copy;
#endif

    struct rb_root_cached tasks_timeline;

    /*
     * 'curr' points to currently running entity on this cfs_rq.
     * It is set to NULL otherwise (i.e when none are currently running).
     */
    struct sched_entity *curr;
    struct sched_entity *next;
    struct sched_entity *last;
    struct sched_entity *skip;
    ......
};
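Because tasks_timeline is an rb_root_cached, the leftmost (smallest-vruntime) entity is cached and can be fetched in O(1). The helper in kernel/sched/fair.c looks roughly like this:

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

    if (!left)
        return NULL;

    return rb_entry(left, struct sched_entity, run_node);
}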
/* Deadline class' related fields in a runqueue */
struct dl_rq {
    /* runqueue is an rbtree, ordered by deadline */
    struct rb_root_cached root;

    unsigned long dl_nr_running;

#ifdef CONFIG_SMP
    /*
     * Deadline values of the currently executing and the
     * earliest ready task on this rq. Caching these facilitates
     * the decision whether or not a ready but not running task
     * should migrate somewhere else.
     */
    struct {
        u64 curr;
        u64 next;
    } earliest_dl;

    unsigned long dl_nr_migratory;
    int overloaded;

    /*
     * Tasks on this rq that can be pushed away. They are kept in
     * an rb-tree, ordered by tasks' deadlines, with caching
     * of the leftmost (earliest deadline) element.
     */
    struct rb_root_cached pushable_dl_tasks_root;
#else
    struct dl_bw dl_bw;
#endif
    /*
     * "Active utilization" for this runqueue: increased when a
     * task wakes up (becomes TASK_RUNNING) and decreased when a
     * task blocks
     */
    u64 running_bw;

    /*
     * Utilization of the tasks "assigned" to this runqueue (including
     * the tasks that are in runqueue and the tasks that executed on this
     * CPU and blocked). Increased when a task moves to this runqueue, and
     * decreased when the task moves away (migrates, changes scheduling
     * policy, or terminates).
     * This is needed to compute the "inactive utilization" for the
     * runqueue (inactive utilization = this_bw - running_bw).
     */
    u64 this_bw;
    u64 extra_bw;

    /*
     * Inverse of the fraction of CPU utilization that can be reclaimed
     * by the GRUB algorithm.
     */
    u64 bw_ratio;
};
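As the comment on this_bw spells out, the "inactive utilization" of a deadline run queue is simply this_bw - running_bw. A hypothetical helper (dl_inactive_bw() is not a real kernel function; the kernel computes the difference inline where the GRUB reclaiming code needs it) would read:

/* Hypothetical helper, for illustration only. */
static inline u64 dl_inactive_bw(const struct dl_rq *dl_rq)
{
    return dl_rq->this_bw - dl_rq->running_bw;
}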
/*
 * __schedule() is the main scheduler function.
 *
 * The main means of driving the scheduler and thus entering this function are:
 *
 *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
 *
 *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
 *      paths. For example, see arch/x86/entry_64.S.
 *
 *      To drive preemption between tasks, the scheduler sets the flag in timer
 *      interrupt handler scheduler_tick().
 *
 *   3. Wakeups don't really cause entry into schedule(). They add a
 *      task to the run-queue and that's it.
 *
 *      Now, if the new task added to the run-queue preempts the current
 *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
 *      called on the nearest possible occasion:
 *
 *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
 *
 *         - in syscall or exception context, at the next outmost
 *           preempt_enable(). (this might be as soon as the wake_up()'s
 *           spin_unlock()!)
 *
 *         - in IRQ context, return from interrupt-handler to
 *           preemptible context
 *
 *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
 *         then at the next:
 *
 *          - cond_resched() call
 *          - explicit schedule() call
 *          - return from syscall or exception to user-space
 *          - return from interrupt-handler to user-space
 *
 * WARNING: must be called with preemption disabled!
 */
static void __sched notrace __schedule(bool preempt)
{
    struct task_struct *prev, *next;
    unsigned long *switch_count;
    struct rq_flags rf;
    struct rq *rq;
    int cpu;

    /* Get the run queue of the current CPU; prev is the task running on it. */
    cpu = smp_processor_id();
    rq = cpu_rq(cpu);
    prev = rq->curr;

    /* Sanity checks, e.g. catch "scheduling while atomic". */
    schedule_debug(prev);

    if (sched_feat(HRTICK))
        hrtick_clear(rq);

    /* Disable local interrupts, inform RCU of the context switch, lock the run queue. */
    local_irq_disable();
    rcu_note_context_switch(preempt);

    /*
     * Make sure that signal_pending_state()->signal_pending() below
     * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
     * done by the caller to avoid the race with signal_wake_up().
     *
     * The membarrier system call requires a full memory barrier
     * after coming from user-space, before storing to rq->curr.
     */
    rq_lock(rq, &rf);
    smp_mb__after_spinlock();

    /* Promote REQ to ACT */
    rq->clock_update_flags <<= 1;
    update_rq_clock(rq);

    switch_count = &prev->nivcsw;
    if (!preempt && prev->state) {
        /* A pending signal keeps an interruptible sleeper runnable. */
        if (signal_pending_state(prev->state, prev)) {
            prev->state = TASK_RUNNING;
        } else {
            /* Otherwise dequeue prev from rq (on_rq becomes 0); if it is
             * going to sleep in iowait, start block-I/O delay accounting. */
            deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
            prev->on_rq = 0;

            if (prev->in_iowait) {
                atomic_inc(&rq->nr_iowait);
                delayacct_blkio_start();
            }

            /*
             * If a worker went to sleep, notify and ask workqueue
             * whether it wants to wake up a task to maintain
             * concurrency.
             */
            if (prev->flags & PF_WQ_WORKER) {
                struct task_struct *to_wakeup;

                to_wakeup = wq_worker_sleeping(prev);
                if (to_wakeup)
                    try_to_wake_up_local(to_wakeup, &rf);
            }
        }
        switch_count = &prev->nvcsw;
    }

    /* Pick the next task to run and store it in next. */
    next = pick_next_task(rq, prev, &rf);
    clear_tsk_need_resched(prev);
    clear_preempt_need_resched();

    /* A different task was picked: switch the context. */
    if (likely(prev != next)) {
        rq->nr_switches++;
        rq->curr = next;
        /*
         * The membarrier system call requires each architecture
         * to have a full memory barrier after updating
         * rq->curr, before returning to user-space.
         *
         * Here are the schemes providing that barrier on the
         * various architectures:
         * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
         *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
         * - finish_lock_switch() for weakly-ordered
         *   architectures where spin_unlock is a full barrier,
         * - switch_to() for arm64 (weakly-ordered, spin_unlock
         *   is a RELEASE barrier),
         */
        ++*switch_count;

        trace_sched_switch(preempt, prev, next);

        /* Also unlocks the rq: */
        rq = context_switch(rq, prev, next, &rf);
    } else {
        /* No switch: clear the clock-skip flags and re-enable interrupts. */
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
        rq_unlock_irq(rq, &rf);
    }

    /* Run any balance callbacks queued during the switch (e.g. RT/DL push/pull). */
    balance_callback(rq);
}
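As the header comment lists, on a non-preemptible kernel one of the ways __schedule() gets entered is an explicit cond_resched() call. A usage sketch (process_queue() and its queue are made up for illustration; only cond_resched() and list_for_each() are real kernel APIs):

#include <linux/sched.h>    /* cond_resched() */
#include <linux/list.h>     /* list_for_each() */

/* Illustration only: a long-running kernel loop that offers a voluntary
 * reschedule point after each item, so another runnable task can get
 * the CPU even with CONFIG_PREEMPT=n. */
static void process_queue(struct list_head *queue)
{
    struct list_head *pos;

    list_for_each(pos, queue) {
        /* ... do the per-item work here ... */
        cond_resched();     /* may end up in __schedule() */
    }
}

Back in __schedule(), the next task is chosen by pick_next_task():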
/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
    const struct sched_class *class;
    struct task_struct *p;

    /*
     * Optimization: if all runnable tasks belong to the fair class we can
     * call fair_sched_class.pick_next_task() directly, but only if the
     * @prev task wasn't of a higher scheduling class, because otherwise
     * those lose the opportunity to pull in more work from other CPUs.
     */
    if (likely((prev->sched_class == &idle_sched_class ||
                prev->sched_class == &fair_sched_class) &&
               rq->nr_running == rq->cfs.h_nr_running)) {

        p = fair_sched_class.pick_next_task(rq, prev, rf);
        if (unlikely(p == RETRY_TASK))
            goto again;

        /* Assumes fair_sched_class->next == idle_sched_class */
        if (unlikely(!p))
            p = idle_sched_class.pick_next_task(rq, prev, rf);

        return p;
    }

again:
    /* Walk the classes in priority order; return the first task found. */
    for_each_class(class) {
        p = class->pick_next_task(rq, prev, rf);
        if (p) {
            if (unlikely(p == RETRY_TASK))
                goto again;
            return p;
        }
    }

    /* The idle class should always have a runnable task: */
    BUG();
}
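for_each_class() walks the scheduling classes from highest to lowest priority: stop, deadline, real-time, fair, idle. In this kernel generation the classes are chained through a ->next pointer; the macros look roughly like this (newer kernels replaced the linked list with a linker-section array):

/* Approximate form of the helpers in kernel/sched/sched.h. */
#define sched_class_highest     (&stop_sched_class)

#define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)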
Now let's look at the context switch itself. A context switch does two main things: it switches the task's address space (its virtual memory), and it switches the registers and CPU context. Address-space switching will be covered in detail in the memory-management articles, so we set it aside here; switching the address space is what effectively completes the user-space part of the context switch. Below we focus on the kernel-side switch, i.e. the registers and the CPU context.
/*
 * context_switch - switch to the new MM and the new thread's register state.
 */
static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next, struct rq_flags *rf)
{
    struct mm_struct *mm, *oldmm;

    prepare_task_switch(rq, prev, next);

    mm = next->mm;
    oldmm = prev->active_mm;
    /*
     * For paravirt, this is coupled with an exit in switch_to to
     * combine the page table reload and the switch backend into
     * one hypercall.
     */
    arch_start_context_switch(prev);

    /*
     * If mm is non-NULL, we pass through switch_mm(). If mm is
     * NULL, we will pass through mmdrop() in finish_task_switch().
     * Both of these contain the full memory barrier required by
     * membarrier after storing to rq->curr, before returning to
     * user-space.
     */
    if (!mm) {
        next->active_mm = oldmm;
        mmgrab(oldmm);
        enter_lazy_tlb(oldmm, next);
    } else
        switch_mm_irqs_off(oldmm, mm, next);

    if (!prev->mm) {
        prev->active_mm = NULL;
        rq->prev_mm = oldmm;
    }

    rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);

    prepare_lock_switch(rq, next, rf);

    /* Here we just switch the register state and the stack. */
    switch_to(prev, next, prev);
    /* barrier() is a compiler barrier: it prevents the compiler from
     * reordering switch_to() with finish_task_switch(). */
    barrier();

    return finish_task_switch(prev);
}
1. A saves its kernel stack and registers and switches to B. At this point prev = A and next = B; that state is saved on A's stack and will be restored the next time A runs. Execution then continues in B, whose finish_task_switch() runs and returns B's run queue rq.
2. B saves its kernel stack and registers and switches to C.
3. C saves its kernel stack and registers and switches to A. A resumes right after barrier(), but the prev = A, next = B values saved back in step 1 know nothing about C, so C's identity would be lost. This is exactly why the last pointer matters: after __switch_to_asm returns, A's restored stack and registers bring back the old prev and next, but the return value carries C's task_struct address, which is stored in last so that finish_task_switch() can do the cleanup for C.
#define switch_to(prev, next, last)                 \
do {                                                \
    prepare_switch_to(next);                        \
                                                    \
    ((last) = __switch_to_asm((prev), (next)));     \
} while (0)
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPU's, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 *
 * The return value (in %ax) will be the "prev" task after
 * the task-switch, and shows up in ret_from_fork in entry.S,
 * for example.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
    struct thread_struct *prev = &prev_p->thread,
                         *next = &next_p->thread;
    struct fpu *prev_fpu = &prev->fpu;
    struct fpu *next_fpu = &next->fpu;
    int cpu = smp_processor_id();

    /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

    switch_fpu_prepare(prev_fpu, cpu);

    /*
     * Save away %gs. No need to save %fs, as it was saved on the
     * stack on entry. No need to save %es and %ds, as those are
     * always kernel segments while inside the kernel. Doing this
     * before setting the new TLS descriptors avoids the situation
     * where we temporarily have non-reloadable segments in %fs
     * and %gs. This could be an issue if the NMI handler ever
     * used %fs or %gs (it does not today), or if the kernel is
     * running inside of a hypervisor layer.
     */
    lazy_save_gs(prev->gs);

    /*
     * Load the per-thread Thread-Local Storage descriptor.
     */
    load_TLS(next, cpu);

    /*
     * Restore IOPL if needed. In normal use, the flags restore
     * in the switch assembly will handle this. But if the kernel
     * is running virtualized at a non-zero CPL, the popf will
     * not restore flags, so it must be done in a separate step.
     */
    if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
        set_iopl_mask(next->iopl);

    switch_to_extra(prev_p, next_p);

    /*
     * Leave lazy mode, flushing any hypercalls made here.
     * This must be done before restoring TLS segments so
     * the GDT and LDT are properly updated, and must be
     * done before fpu__restore(), so the TS bit is up
     * to date.
     */
    arch_end_context_switch(next_p);

    /*
     * Reload esp0 and cpu_current_top_of_stack. This changes
     * current_thread_info(). Refresh the SYSENTER configuration in
     * case prev or next is vm86.
     */
    update_task_stack(next_p);
    refresh_sysenter_cs(next);
    this_cpu_write(cpu_current_top_of_stack,
                   (unsigned long)task_stack_page(next_p) +
                   THREAD_SIZE);

    /*
     * Restore %gs if needed (which is common)
     */
    if (prev->gs | next->gs)
        lazy_load_gs(next->gs);

    switch_fpu_finish(next_fpu, cpu);

    this_cpu_write(current_task, next_p);

    /* Load the Intel cache allocation PQR MSR. */
    resctrl_sched_in();

    return prev_p;
}
/**
 * finish_task_switch - clean up after a task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 *
 * The context switch has flipped the stack from under us and restored the
 * local variables which were saved when this task called schedule() in the
 * past. prev == current is still correct but we need to recalculate this_rq
 * because prev may have moved to another CPU.
 */
static struct rq *finish_task_switch(struct task_struct *prev)
    __releases(rq->lock)
{
    struct rq *rq = this_rq();
    struct mm_struct *mm = rq->prev_mm;
    long prev_state;

    /*
     * The previous task will have left us with a preempt_count of 2
     * because it left us after:
     *
     *    schedule()
     *      preempt_disable();                // 1
     *      __schedule()
     *        raw_spin_lock_irq(&rq->lock)    // 2
     *
     * Also, see FORK_PREEMPT_COUNT.
     */
    if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
                  "corrupted preempt_count: %s/%d/0x%x\n",
                  current->comm, current->pid, preempt_count()))
        preempt_count_set(FORK_PREEMPT_COUNT);

    rq->prev_mm = NULL;

    /*
     * A task struct has one reference for the use as "current".
     * If a task dies, then it sets TASK_DEAD in tsk->state and calls
     * schedule one last time. The schedule call will never return, and
     * the scheduled task must drop that reference.
     *
     * We must observe prev->state before clearing prev->on_cpu (in
     * finish_task), otherwise a concurrent wakeup can get prev
     * running on another CPU and we could race with its RUNNING -> DEAD
     * transition, resulting in a double drop.
     */
    prev_state = prev->state;
    vtime_task_switch(prev);
    perf_event_task_sched_in(prev, current);
    finish_task(prev);
    finish_lock_switch(rq);
    finish_arch_post_lock_switch();
    kcov_finish_switch(current);

    fire_sched_in_preempt_notifiers(current);
    /*
     * When switching through a kernel thread, the loop in
     * membarrier_{private,global}_expedited() may have observed that
     * kernel thread and not issued an IPI. It is therefore possible to
     * schedule between user->kernel->user threads without passing through
     * switch_mm(). Membarrier requires a barrier after storing to
     * rq->curr, before returning to userspace, so provide them here:
     *
     * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
     *   provided by mmdrop(),
     * - a sync_core for SYNC_CORE.
     */
    if (mm) {
        membarrier_mm_sync_core_before_usermode(mm);
        mmdrop(mm);
    }
    if (unlikely(prev_state == TASK_DEAD)) {
        if (prev->sched_class->task_dead)
            prev->sched_class->task_dead(prev);

        /*
         * Remove function-return probe instances associated with this
         * task and put them back on the free list.
         */
        kprobe_flush_task(prev);

        /* Task is done with its stack. */
        put_task_stack(prev);

        put_task_struct(prev);
    }

    tick_nohz_task_switch();
    return rq;
}
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 */
void scheduler_tick(void)
{
    int cpu = smp_processor_id();
    struct rq *rq = cpu_rq(cpu);
    struct task_struct *curr = rq->curr;
    struct rq_flags rf;

    sched_clock_tick();

    rq_lock(rq, &rf);

    update_rq_clock(rq);
    curr->sched_class->task_tick(rq, curr, 0);
    cpu_load_update_active(rq);
    calc_global_load_tick(rq);
    psi_task_tick(rq);

    rq_unlock(rq, &rf);

    perf_event_task_tick();
    ......
}
/*
 * scheduler tick hitting a task of our scheduling class.
 *
 * NOTE: This function can be called remotely by the tick offload that
 * goes along full dynticks. Therefore no local assumption can be made
 * and everything must be accessed through the @rq and @curr passed in
 * parameters.
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &curr->se;

    for_each_sched_entity(se) {
        cfs_rq = cfs_rq_of(se);
        entity_tick(cfs_rq, se, queued);
    }

    if (static_branch_unlikely(&sched_numa_balancing))
        task_tick_numa(rq, curr);

    update_misfit_status(curr, rq);
    update_overutilized_status(task_rq(curr));
}
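entity_tick() is where the per-tick CFS bookkeeping happens: it charges the elapsed time to the running entity and, when other entities are queued, decides whether the current one should be preempted. A trimmed sketch (the real function also updates the load averages and handles the hrtick case):

static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
    /* Charge the elapsed runtime to curr and advance its vruntime. */
    update_curr(cfs_rq);

    /* (load-average and group-scheduling updates elided in this sketch) */

    if (cfs_rq->nr_running > 1)
        check_preempt_tick(cfs_rq, curr);
}

check_preempt_tick(), shown next, is where the time-slice decision is made.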
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
    unsigned long ideal_runtime, delta_exec;
    struct sched_entity *se;
    s64 delta;

    ideal_runtime = sched_slice(cfs_rq, curr);
    delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
    if (delta_exec > ideal_runtime) {
        resched_curr(rq_of(cfs_rq));
        /*
         * The current task ran long enough, ensure it doesn't get
         * re-elected due to buddy favours.
         */
        clear_buddies(cfs_rq, curr);
        return;
    }

    /*
     * Ensure that a task that missed wakeup preemption by a
     * narrow margin doesn't have to wait for a full slice.
     * This also mitigates buddy induced latencies under load.
     */
    if (delta_exec < sysctl_sched_min_granularity)
        return;

    se = __pick_first_entity(cfs_rq);
    delta = curr->vruntime - se->vruntime;

    if (delta < 0)
        return;

    if (delta > ideal_runtime)
        resched_curr(rq_of(cfs_rq));
}
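The ideal_runtime above comes from sched_slice(), which hands each entity a share of the scheduling period proportional to its load weight. A simplified single-level sketch of that idea (the real sched_slice() walks the group-scheduling hierarchy and uses __calc_delta() to avoid overflow; __sched_period() is the real helper that stretches the period when many tasks are runnable):

/* Simplified illustration only: one level, no group scheduling,
 * and no protection against the multiplication overflowing. */
static u64 demo_sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    u64 period = __sched_period(cfs_rq->nr_running);

    /* Weighted share of the period: period * se->load / cfs_rq->load. */
    return div64_u64(period * se->load.weight, cfs_rq->load.weight);
}

When the slice is used up, resched_curr() marks the task for rescheduling: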
/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
    struct task_struct *curr = rq->curr;
    int cpu;
    ......
    cpu = cpu_of(rq);

    if (cpu == smp_processor_id()) {
        set_tsk_need_resched(curr);
        set_preempt_need_resched();
        return;
    }

    if (set_nr_and_not_polling(curr))
        smp_send_reschedule(cpu);
    else
        trace_sched_wake_idle_without_ipi(cpu);
}
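On the local CPU, resched_curr() only raises the need-resched flags; the actual call into __schedule() happens later, when some code path notices them, for example on interrupt return or at an explicit check like the one sketched below (pump_events() is made up; need_resched() and schedule() are real kernel APIs):

#include <linux/sched.h>

/* Illustration only: a kernel loop reacting to the flag that
 * resched_curr() set for the current task. */
static void pump_events(void)
{
    for (;;) {
        /* ... handle one unit of work ... */

        if (need_resched())
            schedule();     /* enters __schedule() shown earlier */
    }
}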