
The Linux Kernel: Process Scheduling

Published: 2014-12-01 15:48:45  Source: linux website  Author: bullbat

Wait queues

The sleep-related functions set the process state to a non-running state; on the next scheduling pass, schedule() removes the process from the run queue. A sleep function puts the process on a wait queue and then calls schedule() to pick another task and switch to it. When a wake_up-style function later wakes the process, it puts it back on the run queue, and execution resumes at the first instruction in the sleep function that had not yet run.
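
A minimal usage sketch (the wait-queue head my_wq and the two functions are hypothetical names, not from the original article): one context sleeps on a wait queue, another wakes it.

static DECLARE_WAIT_QUEUE_HEAD(my_wq);	/* hypothetical wait-queue head */

static void waiter(void)
{
	/* put the current task on my_wq and call schedule() */
	interruptible_sleep_on(&my_wq);
	/* execution resumes here after a wake_up*() on my_wq */
}

static void waker(void)
{
	/* move the tasks sleeping on my_wq back onto a run queue */
	wake_up_interruptible(&my_wq);
}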

The sleep-style functions are all implemented on top of sleep_on_common(); they differ only in the arguments they pass.
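
Roughly, the wrappers look like this (a sketch for this kernel generation; only the task state and timeout differ), and sleep_on_common() itself follows below:

void __sched sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}

long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
}

void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}

long __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
}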

static long __sched
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
{
	unsigned long flags;
	wait_queue_t wait;

	/* initialize the wait-queue entry for the current task */
	init_waitqueue_entry(&wait, current);
	/* set the state of the current process */
	__set_current_state(state);

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, &wait);		/* add the entry to the wait queue */
	spin_unlock(&q->lock);
	/* sleep until timeout; other processes run while this one sleeps */
	timeout = schedule_timeout(timeout);
	spin_lock_irq(&q->lock);
	/* when this process is woken up, execution resumes here:
	   remove it from the wait queue */
	__remove_wait_queue(q, &wait);
	spin_unlock_irqrestore(&q->lock, flags);

	return timeout;
}

static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
	q->flags = 0;
	q->private = p;				/* store the task as the entry's private data */
	q->func = default_wake_function;	/* register the default wake-up callback */
}


Now look at the wake-up side: default_wake_function ultimately calls try_to_wake_up().
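
default_wake_function itself is only a thin wrapper; roughly (a sketch for this kernel generation), it passes the task stored in the wait-queue entry's private field on to try_to_wake_up():

int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
			  void *key)
{
	return try_to_wake_up(curr->private, mode, wake_flags);
}

try_to_wake_up() then does the real work: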

/***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * returns failure only if the task is already active.
 */ 
static int try_to_wake_up(struct task_struct *p, unsigned int state,
			  int wake_flags)
{
	int cpu, orig_cpu, this_cpu, success = 0;
	unsigned long flags;
	struct rq *rq, *orig_rq;

	if (!sched_feat(SYNC_WAKEUPS))
		wake_flags &= ~WF_SYNC;	/* the waker does not go to sleep right after the wakeup */

	this_cpu = get_cpu();	/* id of the CPU we are running on */

	smp_wmb();
	rq = orig_rq = task_rq_lock(p, &flags);	/* lock and return p's run queue */
	update_rq_clock(rq);	/* update the rq clock */
	if (!(p->state & state))
		goto out;

	if (p->se.on_rq)	/* the task is already on a run queue */
		goto out_running;

	cpu = task_cpu(p);	/* the CPU the task last ran on */
	orig_cpu = cpu;

#ifdef CONFIG_SMP
	if (unlikely(task_running(rq, p)))	/* p is currently executing on that CPU */
		goto out_activate;

	/*
	 * In order to handle concurrent wakeups and release the rq->lock
	 * we put the task in TASK_WAKING state.
	 *
	 * First fix up the nr_uninterruptible count:
	 */
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;
	p->state = TASK_WAKING;
	task_rq_unlock(rq, &flags);
	/*
	 * Used when exec'ing a new program or waking a task: based on the
	 * current load of each processor in the SMP system, decide whether
	 * the task should move to another processor's run queue, and return
	 * the chosen target CPU.
	 */
	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
	if (cpu != orig_cpu)
		set_task_cpu(p, cpu);	/* make the task run on the chosen CPU */

	rq = task_rq_lock(p, &flags);	/* the task's (possibly new) rq */

	if (rq != orig_rq)
		update_rq_clock(rq);	/* update the clock of the new rq */

	WARN_ON(p->state != TASK_WAKING);
	cpu = task_cpu(p);

#ifdef CONFIG_SCHEDSTATS
	schedstat_inc(rq, ttwu_count);	/* one more task woken up on this rq */
	if (cpu == this_cpu)
		/* one more wake-up that stayed on the local CPU */
		schedstat_inc(rq, ttwu_local);
	else {
		struct sched_domain *sd;

		for_each_domain(this_cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				schedstat_inc(sd, ttwu_wake_remote);
				break;
			}
		}
	}
#endif /* CONFIG_SCHEDSTATS */

out_activate:
#endif /* CONFIG_SMP */
	/* update the wake-up statistics below */
	schedstat_inc(p, se.nr_wakeups);
	if (wake_flags & WF_SYNC)
		schedstat_inc(p, se.nr_wakeups_sync);
	if (orig_cpu != cpu)
		schedstat_inc(p, se.nr_wakeups_migrate);
	if (cpu == this_cpu)
		schedstat_inc(p, se.nr_wakeups_local);
	else
		schedstat_inc(p, se.nr_wakeups_remote);
	/* put the task on the run queue of its scheduling class */
	activate_task(rq, p, 1);
	success = 1;

	/*
	 * Only attribute actual wakeups done by this task.
	 */
	if (!in_interrupt()) {
		/* update last_wakeup and avg_wakeup in the waker's sched_entity */
		struct sched_entity *se = &current->se;
		u64 sample = se->sum_exec_runtime;

		if (se->last_wakeup)
			sample -= se->last_wakeup;
		else
			sample -= se->start_runtime;
		update_avg(&se->avg_wakeup, sample);

		se->last_wakeup = se->sum_exec_runtime;
	}

out_running:
	trace_sched_wakeup(rq, p, success);

	/* decide whether this task may preempt the currently running task */
	check_preempt_curr(rq, p, wake_flags);

	p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
	if (p->sched_class->task_wake_up)
		p->sched_class->task_wake_up(rq, p);

	if (unlikely(rq->idle_stamp)) {
		/* idle_stamp records when this CPU went idle; it is cleared here */
		u64 delta = rq->clock - rq->idle_stamp;
		u64 max = 2*sysctl_sched_migration_cost;

		if (delta > max)
			rq->avg_idle = max;
		else	/* avg_idle reflects how long this CPU tends to stay idle */
			update_avg(&rq->avg_idle, delta);
		rq->idle_stamp = 0;
	}
#endif
out:
	task_rq_unlock(rq, &flags);
	put_cpu();

	return success;
}
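
The update_avg() helper used above keeps a simple running average; roughly (a sketch for this kernel generation):

static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = sample - *avg;

	/* exponential moving average: move 1/8 of the way toward the sample */
	*avg += diff >> 3;
}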

All wake_up-style functions are ultimately implemented by __wake_up_common():

static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
			int nr_exclusive, int wake_flags, void *key)
{
	wait_queue_t *curr, *next;

	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
		unsigned flags = curr->flags;

		/* curr->func is the callback registered in the wait-queue entry,
		   normally default_wake_function, which calls try_to_wake_up */
		if (curr->func(curr, mode, wake_flags, key) &&
				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;
	}
}
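
For example, wake_up() and the __wake_up() entry point behind it (shown here as a sketch for this kernel generation) just take the wait-queue lock and delegate to __wake_up_common():

#define wake_up(x)	__wake_up(x, TASK_NORMAL, 1, NULL)

void __wake_up(wait_queue_head_t *q, unsigned int mode,
	       int nr_exclusive, void *key)
{
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_common(q, mode, nr_exclusive, 0, key);
	spin_unlock_irqrestore(&q->lock, flags);
}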


The wait_event approach

The sleep_on-style functions cannot be used in one common situation: when the code must test a condition and put the process to sleep only if that condition has not yet become true. To support this, the kernel provides the wait_event mechanism.

#define __wait_event(wq, condition)					\
do {									\
	DEFINE_WAIT(__wait);						\
									\
	for (;;) {							\
		/* add to the wait queue and set the process state */	\
		prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);	\
		if (condition)						\
			break;						\
		schedule();	/* let other processes run */		\
	}								\
	/* when the process is woken up it continues here */		\
	finish_wait(&wq, &__wait);					\
} while (0)
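
The public wait_event() macro only adds a fast path that skips the wait entirely when the condition already holds; roughly:

#define wait_event(wq, condition)		\
do {						\
	if (condition)				\
		break;				\
	__wait_event(wq, condition);		\
} while (0)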


On the next scheduling pass, the scheduler removes the now non-runnable current process from the run queue. When a wake_up-style function later wakes the process, it puts it back on the run queue, and execution continues with the part of wait_event that has not yet run, namely finish_wait(), which removes the entry from the wait queue.
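
A typical (hypothetical) use of this pair, with made-up names my_wq and flag: the consumer sleeps until the producer sets the condition and wakes it.

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static int flag;

static void consumer(void)
{
	/* sleeps until flag becomes non-zero */
	wait_event(my_wq, flag != 0);
	/* flag is guaranteed to be non-zero here */
}

static void producer(void)
{
	flag = 1;		/* make the condition true first */
	wake_up(&my_wq);	/* then wake the sleepers */
}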

In Linux scheduling, schedule() both selects the next process and carries out the switch between processes. Inside schedule(), the switch is handled mainly by two functions:
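
In schedule() the two calls sit together on the switch path; roughly (a trimmed sketch of the relevant lines, omitting unrelated statements):

	/* inside schedule(), after pick_next_task() has chosen 'next' */
	if (likely(prev != next)) {
		sched_info_switch(prev, next);	/* accounting for prev and next */

		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		context_switch(rq, prev, next);	/* unlocks the rq */
	} else
		spin_unlock_irq(&rq->lock);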

sched_info_switch(prev, next) mainly updates the accounting fields of the outgoing and incoming processes and of their run queues. It is implemented essentially by __sched_info_switch().
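
The wrapper itself only checks whether scheduler statistics/delay accounting are enabled; roughly:

static inline void
sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	if (unlikely(sched_info_on()))
		__sched_info_switch(prev, next);
}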
 
/*
 * Called when tasks are switched involuntarily due, typically, to expiring
 * their time slice.  (This may also be called when switching to or from
 * the idle task.)  We are only called when prev != next.
 */ 
static inline void
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	struct rq *rq = task_rq(prev);

	/*
	 * prev now departs the cpu.  It's not interesting to record
	 * stats about how efficient we were at scheduling the idle
	 * process, however.
	 */
	if (prev != rq->idle)		/* the outgoing task is not the idle task */
		sched_info_depart(prev);	/* update prev's stats and those of its rq */

	if (next != rq->idle)		/* the incoming task is not the idle task */
		sched_info_arrive(next);	/* update next's stats and those of its rq */
}

/*
 * Called when a process ceases being the active-running process, either
 * voluntarily or involuntarily.  Now we can calculate how long we ran.
 * Also, if the process is still in the TASK_RUNNING state, call
 * sched_info_queued() to mark that it has now again started waiting on
 * the runqueue.
 */ 
static inline void sched_info_depart(struct task_struct *t)
{
	/* how long the task has been running on its rq */
	unsigned long long delta = task_rq(t)->clock -
					t->sched_info.last_arrival;

	/* accumulate the CPU time obtained by tasks of this run queue */
	rq_sched_info_depart(task_rq(t), delta);

	/*
	 * If the departing task is still runnable, record the rq clock in
	 * sched_info.last_queued: the moment it last started waiting to run.
	 */
	if (t->state == TASK_RUNNING)
		sched_info_queued(t);
}
/*
 * Called when a task finally hits the cpu.  We can now calculate how
 * long it was waiting to run.  We also note when it began so that we
 * can keep stats on how long its timeslice is.
 */ 
static void sched_info_arrive(struct task_struct *t)
{
	unsigned long long now = task_rq(t)->clock, delta = 0;

	if (t->sched_info.last_queued)	/* the task was queued before being switched in */
		delta = now - t->sched_info.last_queued;	/* time spent waiting in the queue */
	sched_info_reset_dequeued(t);	/* it is about to run, so clear last_queued */
	t->sched_info.run_delay += delta;	/* total time spent waiting on the run queue */
	t->sched_info.last_arrival = now;	/* when it last got the CPU */
	t->sched_info.pcount++;		/* number of times it has run on a CPU */
	/* update the matching fields in the rq's rq_sched_info */
	rq_sched_info_arrive(task_rq(t), delta);
}
 

context_switch() performs the actual switch of the address space, registers, and other hardware state.
 
/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */ 
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);
	trace_sched_switch(rq, prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	if (unlikely(!mm)) {		/* the incoming task has no mm of its own */
		next->active_mm = oldmm;	/* borrow the outgoing task's active_mm */
		atomic_inc(&oldmm->mm_count);	/* one more user, so bump the reference count */
		/* mark the per-cpu cpu_tlbstate as LAZY */
		enter_lazy_tlb(oldmm, next);
	} else				/* otherwise switch address spaces */
		switch_mm(oldmm, mm, next);

	if (unlikely(!prev->mm)) {	/* the outgoing task had no mm of its own;
					   as seen above, its active_mm was borrowed
					   from an earlier task, so clear it here */
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;	/* remember the previous mm in the rq */
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
			     struct task_struct *tsk)
{
	unsigned cpu = smp_processor_id();

	if (likely(prev != next)) {
		/* stop flush ipis for the previous mm */
		/* clear this CPU's bit in the CPU mask of the outgoing mm */
		cpumask_clear_cpu(cpu, mm_cpumask(prev));
#ifdef CONFIG_SMP
		/* update the per-cpu TLB state */
		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		percpu_write(cpu_tlbstate.active_mm, next);
#endif
		/* set this CPU's bit in the CPU mask of the incoming mm */
		cpumask_set_cpu(cpu, mm_cpumask(next));

		/* Re-load page tables */
		load_cr3(next->pgd);	/* load the incoming mm's pgd into cr3 */

		/*
		 * load the LDT, if the LDT is different:
		 */
		if (unlikely(prev->context.ldt != next->context.ldt))
			load_LDT_nolock(&next->context);
	}
#ifdef CONFIG_SMP
	else {	/* the outgoing and incoming tasks use the same mm */
		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);

		if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
			/* We were in lazy tlb mode and leave_mm disabled
			 * tlb flush IPI delivery. We must reload CR3
			 * to make sure to use no freed page tables.
			 */
			load_cr3(next->pgd);
			load_LDT_nolock(&next->context);
		}
	}
#endif
}
 

The register-level switch itself is done by switch_to. This macro saves and restores the various registers in inline assembly and then calls the C function __switch_to.

The assembly implements the actual switch:

/*
 * Saving eflags is important. It switches not only IOPL between tasks,
 * it also protects other tasks from NT leaking through sysenter etc.
 */ 
#define switch_to(prev, next, last)					\
do {									\
	/*								\
	 * Context-switching clobbers all registers, so we clobber	\
	 * them explicitly, via unused output variables.		\
	 * (EAX and EBP is not listed because EBP is saved/restored	\
	 * explicitly for wchan access and EAX is the return value of	\
	 * __switch_to())						\
	 */								\
	unsigned long ebx, ecx, edx, esi, edi;				\
									\
	asm volatile("pushfl\n\t"		/* save    flags */	\
		     "pushl %%ebp\n\t"		/* save    EBP   */	\
		     "movl %%esp,%[prev_sp]\n\t" /* save    ESP   */	\
		     "movl %[next_sp],%%esp\n\t" /* restore ESP   */	\
		     "movl $1f,%[prev_ip]\n\t"	/* save    EIP   */	\
		     /* push next_ip and jmp below: when the new task	\
			returns from __switch_to it resumes at label 1 */ \
		     "pushl %[next_ip]\n\t"	/* restore EIP   */	\
		     __switch_canary					\
		     "jmp __switch_to\n"	/* regparm call  */	\
		     "1:\t"						\
		     /* first instructions executed in the new task */	\
		     "popl %%ebp\n\t"		/* restore EBP   */	\
		     "popfl\n"			/* restore flags */	\
									\
		     /* output parameters */				\
		     : [prev_sp] "=m" (prev->thread.sp),		\
		       [prev_ip] "=m" (prev->thread.ip),		\
		       "=a" (last),					\
									\
		       /* clobbered output registers: */		\
		       "=b" (ebx), "=c" (ecx), "=d" (edx),		\
		       "=S" (esi), "=D" (edi)				\
									\
		       __switch_canary_oparam				\
									\
		       /* input parameters: */				\
		     : [next_sp]  "m" (next->thread.sp),		\
		       [next_ip]  "m" (next->thread.ip),		\
									\
		       /* regparm parameters for __switch_to(): */	\
		       [prev]     "a" (prev),				\
		       [next]     "d" (next)				\
									\
		       __switch_canary_iparam				\
									\
		     : /* reloaded segment registers */			\
			"memory");					\
} while (0)

/* 
 *  switch_to(x,y) should switch tasks from x to y. 
 * 
 * We fsave/fwait so that an exception goes off at the right time 
 * (as a call from the fsave or fwait in effect) rather than to 
 * the wrong process. Lazy FP saving no longer makes any sense 
 * with modern CPU's, and this simplifies a lot of things (SMP 
 * and UP become the same). 
 * 
 * NOTE! We used to use the x86 hardware context switching. The 
 * reason for not using it any more becomes apparent when you 
 * try to recover gracefully from saved state that is no longer 
 * valid (stale segment register values in particular). With the 
 * hardware task-switch, there is no way to fix up bad state in 
 * a reasonable manner. 
 * 
 * The fact that Intel documents the hardware task-switching to 
 * be slow is a fairly red herring - this code is not noticeably 
 * faster. However, there _is_ some room for improvement here, 
 * so the performance issues may eventually be a valid point. 
 * More important, however, is the fact that this allows us much 
 * more flexibility. 
 * 
 * The return value (in %ax) will be the "prev" task after 
 * the task-switch, and shows up in ret_from_fork in entry.S, 
 * for example. 
 */ 
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
			     *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);	/* init_tss is a per-cpu variable */
	bool preload_fpu;

	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

	/*
	 * If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* save the FPU registers of the outgoing task, if it used them */
	__unlazy_fpu(prev_p);

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->xstate);

	/*
	 * Reload esp0.
	 */
	/*
	 * Load next_p->thread.sp0 into the esp0 field of this CPU's TSS;
	 * any user-to-kernel privilege transition triggered by the sysenter
	 * instruction will copy this address into the esp register.
	 */
	load_sp0(tss, next);

	/*
	 * Save away %gs. No need to save %fs, as it was saved on the
	 * stack on entry.  No need to save %es and %ds, as those are
	 * always kernel segments while inside the kernel.  Doing this
	 * before setting the new TLS descriptors avoids the situation
	 * where we temporarily have non-reloadable segments in %fs
	 * and %gs.  This could be an issue if the NMI handler ever
	 * used %fs or %gs (it does not today), or if the kernel is
	 * running inside of a hypervisor layer.
	 */
	lazy_save_gs(prev->gs);

	/*
	 * Load the per-thread Thread-Local Storage descriptor.
	 */
	/*
	 * Install the TLS segments used by next into this CPU's GDT;
	 * the three segment selectors are kept in the tls_array field
	 * of the process descriptor.
	 */
	load_TLS(next, cpu);

	/*
	 * Restore IOPL if needed.  In normal use, the flags restore
	 * in the switch assembly will handle this.  But if the kernel
	 * is running virtualized at a non-zero CPL, the popf will
	 * not restore flags, so it must be done in a separate step.
	 */
	if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
		set_iopl_mask(next->iopl);

	/*
	 * Now maybe handle debug registers and/or IO bitmaps
	 */
	if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
		     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If we're going to preload the fpu context, make sure clts
	   is run while we're batching the cpu state updates. */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	if (preload_fpu)
		__math_state_restore();		/* load the FPU registers of the incoming task */

	/*
	 * Restore %gs if needed (which is common)
	 */
	if (prev->gs | next->gs)
		lazy_load_gs(next->gs);

	percpu_write(current_task, next_p);

	return prev_p;
}
static inline void __unlazy_fpu(struct task_struct *tsk)
{
	/*
	 * The TS_USEDFPU flag in the status field of thread_info records
	 * whether the task has used the FPU/MMX/XMM registers during its
	 * current stint on the CPU.
	 */
	if (task_thread_info(tsk)->status & TS_USEDFPU) {
		/*
		 * tsk executed FPU/MMX/SSE or SSE2 instructions this time
		 * around, so the kernel must save the corresponding
		 * hardware context.
		 */
		__save_init_fpu(tsk);
		stts();
	} else
		tsk->fpu_counter = 0;
}
static inline void __save_init_fpu(struct task_struct *tsk)
{
	/* use xsave if the CPU supports the XSAVE extension, fxsave otherwise */
	if (task_thread_info(tsk)->status & TS_XSAVE)
		xsave(tsk);
	else
		fxsave(tsk);

	clear_fpu_state(tsk);
	task_thread_info(tsk)->status &= ~TS_USEDFPU;	/* clear the TS_USEDFPU flag */
}