小鸟学飞-红联Linux系统门户

我一直想，应该给自己的第一个帖子起个什么标题好，婉转的小鸟的歌声不期而至，于是就以这个为题吧。这段时间，我一直在学习LINUX内核，却如赵传的歌声那样，我是一只小小鸟，怎么飞也飞不高，期望能和大家一起探讨探讨这内核的奥妙。写这帖子就如同我上交的作业。
我想从进程的调度开始。以下是从MontaVista Linux预览版本上摘录的schedule（），就从这里开始吧。
代码在kenerl/Sched.c中。该版本采用2.4.XX内核。
/*
* schedule() is the main scheduler function.
*/
asmlinkage void schedule(void)
{
task_t *prev, *next;
runqueue_t *rq;
prio_array_t *array;
struct list_head *queue;
int idx;

if (unlikely(in_interrupt()))
BUG();

need_resched:
preempt_disable();
prev = current;
rq = this_rq();

release_kernel_lock(prev, smp_processor_id());
prepare_arch_schedule(prev);
prev->sleep_timestamp = jiffies;
spin_lock_irq(&rq->lock);

/*
* if entering from preempt_schedule, off a kernel preemption,
* go straight to picking the next task.
*/
if (unlikely(preempt_get_count() & PREEMPT_ACTIVE))
goto pick_next_task;

switch (prev->state) {
case TASK_INTERRUPTIBLE:
if (unlikely(signal_pending(prev))) {
prev->state = TASK_RUNNING;
break;
}
default:
deactivate_task(prev, rq);
case TASK_RUNNING:
;
}
pick_next_task:
if (unlikely(!rq->nr_running)) {
#if CONFIG_SMP
load_balance(rq, 1);
if (rq->nr_running)
goto pick_next_task;
#endif
next = rq->idle;
rq->expired_timestamp = 0;
goto switch_tasks;
}

array = rq->active;
if (unlikely(!array->nr_active)) {
/*
* Switch the active and expired arrays.
*/
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
rq->expired_timestamp = 0;
}

idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);

switch_tasks:
prefetch(next);
clear_tsk_need_resched(prev);

if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;

prepare_arch_switch(rq);

TRACE_SCHEDCHANGE(prev, next);

prev = context_switch(prev, next);
barrier();
rq = this_rq();
finish_arch_switch(rq);
} else
spin_unlock_irq(&rq->lock);
finish_arch_schedule(prev);

reacquire_kernel_lock(current);
preempt_enable_no_resched();
if (need_resched())
goto need_resched;
}
该内核虽然说是2.4的，但该schedule()更象是2.6版本的。以下是2.4.31版本的
/*
* 'schedule()' is the scheduler function. It's a very simple and nice
* scheduler: it's not perfect, but certainly works for most things.
*
* The goto is "interesting".
*
* NOTE!! Task 0 is the 'idle' task, which gets called when no other
* tasks can run. It can not be killed, and it cannot sleep. The 'state'
* information in task[0] is never used.
*/
asmlinkage void schedule(void)
{
struct schedule_data * sched_data;
struct task_struct *prev, *next, *p;
struct list_head *tmp;
int this_cpu, c;

spin_lock_prefetch(&runqueue_lock);

BUG_ON(!current->active_mm);
need_resched_back:
prev = current;
this_cpu = prev->processor;

if (unlikely(in_interrupt())) {
printk("Scheduling in interrupt\n");
BUG();
}

release_kernel_lock(prev, this_cpu);

/*
* 'sched_data' is protected by the fact that we can run
* only one process per CPU.
*/
sched_data = & aligned_data[this_cpu].schedule_data;

spin_lock_irq(&runqueue_lock);

/* move an exhausted RR process to be last.. */
if (unlikely(prev->policy == SCHED_RR))
if (!prev->counter) {
prev->counter = NICE_TO_TICKS(prev->nice);
move_last_runqueue(prev);
}

switch (prev->state) {
case TASK_INTERRUPTIBLE:
if (signal_pending(prev)) {
prev->state = TASK_RUNNING;
break;
}
default:
del_from_runqueue(prev);
case TASK_RUNNING:;
}
prev->need_resched = 0;

/*
* this is the scheduler proper:
*/

repeat_schedule:
/*
* Default process to select..
*/
next = idle_task(this_cpu);
c = -1000;
list_for_each(tmp, &runqueue_head) {
p = list_entry(tmp, struct task_struct, run_list);
if (can_schedule(p, this_cpu)) {
int weight = goodness(p, this_cpu, prev->active_mm);
if (weight > c)
c = weight, next = p;
}
}

/* Do we need to re-calculate counters? */
if (unlikely(!c)) {
struct task_struct *p;

spin_unlock_irq(&runqueue_lock);
read_lock(&tasklist_lock);
for_each_task(p)
p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
read_unlock(&tasklist_lock);
spin_lock_irq(&runqueue_lock);
goto repeat_schedule;
}

/*
* from this point on nothing can prevent us from
* switching to the next task, save this fact in
* sched_data.
*/
sched_data->curr = next;
task_set_cpu(next, this_cpu);
spin_unlock_irq(&runqueue_lock);

if (unlikely(prev == next)) {
/* We won't go through the normal tail, so do this by hand */
prev->policy &= ~SCHED_YIELD;
goto same_process;
}

#ifdef CONFIG_SMP
/*
* maintain the per-process 'last schedule' value.
* (this has to be recalculated even if we reschedule to
* the same process) Currently this is only used on SMP,
* and it's approximate, so we do not have to maintain
* it while holding the runqueue spinlock.
*/
sched_data->last_schedule = get_cycles();

/*
* We drop the scheduler lock early (it's a global spinlock),
* thus we have to lock the previous process from getting
* rescheduled during switch_to().
*/

#endif /* CONFIG_SMP */

kstat.context_swtch++;
/*
* there are 3 processes which are affected by a context switch:
*
* prev == .... ==> (last => next)
*
* It's the 'much more previous' 'prev' that is on next's stack,
* but prev is set to (the just run) 'last' process by switch_to().
* This might sound slightly confusing but makes tons of sense.
*/
prepare_to_switch();
{
struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;
if (!mm) {
BUG_ON(next->active_mm);
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next, this_cpu);
} else {
BUG_ON(next->active_mm != mm);
switch_mm(oldmm, mm, next, this_cpu);
}

if (!prev->mm) {
prev->active_mm = NULL;
mmdrop(oldmm);
}
}

/*
* This just switches the register state and the
* stack.
*/
switch_to(prev, next, prev);
__schedule_tail(prev);

same_process:
reacquire_kernel_lock(current);
if (current->need_resched)
goto need_resched_back;
return;
}

邓西村于 2006-08-16 10:04:30发表:

arm进程调度中的上下文切换

前段时间，偷懒了。
上次到了cpu_switch_mm，它是：
#define cpu_switch_mm(pgd,tsk) cpu_set_pgd(__virt_to_phys((unsigned long)(pgd)))
上面引用到了：
#define cpu_set_pgd(pgd) processor.pgtable.set_pgd(pgd)
和
#define __virt_to_phys(vpage) ((vpage) - PAGE_OFFSET + PHYS_OFFSET) （摘自AT9200，所有ARM体系的都差不多）

processor是processor结构体的实例，processor定义如下，是个很负责的结构。
/*
* Don't change this structure - ASM code
* relies on it.
*/
extern struct processor {
/* MISC
* get data abort address/flags
*/
void (*_data_abort)(unsigned long pc);
/*
* check for any bugs
*/
void (*_check_bugs)(void);
/*
* Set up any processor specifics
*/
void (*_proc_init)(void);
/*
* Disable any processor specifics
*/
void (*_proc_fin)(void);
/*
* Special stuff for a reset
*/
volatile void (*reset)(unsigned long addr);
/*
* Idle the processor
*/
int (*_do_idle)(void);
/*
* Processor architecture specific
*/
struct { /* CACHE */
/*
* flush all caches
*/
void (*clean_invalidate_all)(void);
/*
* flush a specific page or pages
*/
void (*clean_invalidate_range)(unsigned long address, unsigned long end, int flags);
/*
* flush a page to RAM
*/
void (*_flush_ram_page)(void *virt_page);
} cache;

struct { /* D-cache */
/*
* invalidate the specified data range
*/
void (*invalidate_range)(unsigned long start, unsigned long end);
/*
* clean specified data range
*/
void (*clean_range)(unsigned long start, unsigned long end);
/*
* obsolete flush cache entry
*/
void (*clean_page)(void *virt_page);
/*
* clean a virtual address range from the
* D-cache without flushing the cache.
*/
void (*clean_entry)(unsigned long start);
} dcache;

struct { /* I-cache */
/*
* invalidate the I-cache for the specified range
*/
void (*invalidate_range)(unsigned long start, unsigned long end);
/*
* invalidate the I-cache for the specified virtual page
*/
void (*invalidate_page)(void *virt_page);
} icache;

struct { /* TLB */
/*
* flush all TLBs
*/
void (*invalidate_all)(void);
/*
* flush a specific TLB
*/
void (*invalidate_range)(unsigned long address, unsigned long end);
/*
* flush a specific TLB
*/
void (*invalidate_page)(unsigned long address, int flags);
} tlb;

struct { /* PageTable */
/*
* Set the page table
*/
void (*set_pgd)(unsigned long pgd_phys);
/*
* Set a PMD (handling IMP bit 4)
*/
void (*set_pmd)(pmd_t *pmdp, pmd_t pmd);
/*
* Set a PTE
*/
void (*set_pte)(pte_t *ptep, pte_t pte);
} pgtable;
} processor;
可以想象每个处理器都应该有一个processor实例与之对应。ARM的内存管理是怎样的呢？

phpjava 于 2006-08-11 16:48:29发表:

觉得很深，楼主加油

邓西村于 2006-08-09 22:49:22发表:

context_switch->switch_mm，不同的CPU有不同的switch_mm,以下是ARM的switch_mm
/*
* This is the actual mm switch as far as the scheduler
* is concerned. No registers are touched.
*/
static inline void
switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk, unsigned int cpu)
{
if (prev != next) {
cpu_switch_mm(next->pgd, tsk);
clear_bit(cpu, &prev->cpu_vm_mask);
}
set_bit(cpu, &next->cpu_vm_mask);
}

这段代码是在include/asm-arm/Mmu_context.h，其关键是cpu_switch_mm()，看代码：
#define cpu_switch_mm(pgd,tsk) cpu_set_pgd(__virt_to_phys((unsigned long)(pgd)))
这是来自cpu-multi32.h，还有两个不同的版本分别在CPU_single.h和CPU_Multi26.h中，代码
都差不多。

LINUX的I386的内存管理都说得挺多了，而ARM的内存管理是如何实现的呢？
我想看看。

邓西村于 2006-08-09 22:34:51发表:

接下来看看load_LDT(next)都干了些什么？
在asm-i386下Desc.h中找到了它的身影，也就是说，不同CPU这段代码也应该是不同的。
/*
* load one particular LDT into the current CPU
*/
static inline void load_LDT (struct mm_struct *mm)
{
int cpu = smp_processor_id();
void *segments = mm->context.segments;
int count = LDT_ENTRIES;

if (!segments) {
segments = &default_ldt[0];
count = 5;
}

set_ldt_desc(cpu, segments, count);
__load_LDT(cpu);
}

#define __load_LDT(n) __asm__ __volatile__("lldt %%ax"::"a" (__LDT(n)<<3))
而如果是ARM为CPU的话，应该又是怎样的呢？

邓西村于 2006-08-08 12:02:28发表:

static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
{
if (likely(prev != next)) { /*如果prev与next不等，就需要切换 */
/* stop flush ipis for the previous mm */
clear_bit(cpu, &prev->cpu_vm_mask);
/*
* Re-load LDT if necessary
*/
if (unlikely(prev->context.segments != next->context.segments))
load_LDT(next);
#ifdef CONFIG_SMP
cpu_tlbstate[cpu].state = TLBSTATE_OK;
cpu_tlbstate[cpu].active_mm = next;
#endif
set_bit(cpu, &next->cpu_vm_mask);
set_bit(cpu, &next->context.cpuvalid);
/* Re-load page tables */
load_cr3(next->pgd);
}
#ifdef CONFIG_SMP
else { /*此处prev和next相等，即还是那个任务，由此不需要切换了*/
cpu_tlbstate[cpu].state = TLBSTATE_OK;
if(cpu_tlbstate[cpu].active_mm != next)
out_of_line_bug();
if(!test_and_set_bit(cpu, &next->cpu_vm_mask)) {
/* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload %cr3.
*/
load_cr3(next->pgd);
}
if (!test_and_set_bit(cpu, &next->context.cpuvalid))
load_LDT(next);
}
#endif
}

cpu_vm_mask在这里出现了多次，有什么用呢？以下一段文字有了个大概的说明：
cpu_vm_mask should simply and only tell on which CPU a certain
mm_struct is running on. It should tell on which CPUs current->active_mm
is equal to the mm where we reading mm->cpu_vm_mask.
cpu_vm_mask应能够简单而且唯一地说明，某个mm_struct正运行在哪个CPU上。
它应该说明在多CPU上，哪个CPU上有那个current->active_mm所指的mm，
它等于我们正在读取的mm->cpu_vm_mask中确定的那个mm。
This seems to me a sane semantic and it's easy to enforce it by touching
the cpu-mask bit only while switching_mm in the scheduler code.

The only detail to care about is to flush in memory the cpu_vm_mask
changes before switching pgd and first writing the page-table
changes and then reading the cpu_vm_mask. If some MM will switch CPU while
we are doing the flush that's not a problem because while changing %%cr3
such CPU will automatically flush the TLB.
唯一需要注意的细节是内存的刷新，cpu_vm_mask在切换pgd之前改变，然后第一次写
page_table变化，然后读取cpu_vm_mask.如果在我们正在刷新时有些MM要切换CPU，
这并不是什么问题，因为当改变这个CPU的%%cr3时，它会自动刷新TLB表。

我又断章取意地摘了一段：
inside schedule(), calling switch_mm() clear cpu_vm_mask bit for current mm, trying
stop ipi for flushing tlb。
ipi即（Inter-Processor Interrupt 处理器间中断），
An Inter-Processor Interrupt (IPI) is a special type of interrupt by which one processor may interrupt another processor in a multiprocessor system.
至于如何刷新TLB表，我想迟些再研究。

不知翻译得对不对，请大家指正。

[ 本帖最后由邓西村于 2006-8-9 22:06 编辑 ]

邓西村于 2006-08-03 17:15:35发表:

/*
* context_switch - switch to the new MM and the new
* thread's register state.
*/
static inline task_t * context_switch(task_t *prev, task_t *next)
{
struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;

if (unlikely(!mm)) {
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next, smp_processor_id());
} else
switch_mm(oldmm, mm, next, smp_processor_id());

if (unlikely(!prev->mm)) {
prev->active_mm = NULL;
mmdrop(oldmm);
}

/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);

return prev;
}

context_switch完成新旧任务（prev、next）上下文的切换。在task_struct中有两个mm_struct类型指针，
mm和active_mm，它们在使用时有什么不同呢？以下为一段在网上找的解释
前者指向进程所拥有的内存区域，后者则指向进程所实际使用的内存。对于大多数进程，mm和active_mm是相同的，但核心线程没有自主的内存，它们的mm指针永远为NULL。我们知道，任何进程的虚页表中，核心空间永远映射到了虚存的高端固定位置，所以，核心线程所使用的内存无论对于哪个进程空间都是一样的，所以也就没有必要切换进程的内存。在调度器中，只要判断一下next->mm是否为空就能知道该进程是不是核心线程，如果是，则继续使用prev的active_mm（next->active_mm = prev->active_mm），并通过设置cpu_tlbstate[cpu].state为TLBSTATE_LAZY，告诉内存管理部件不要刷新TLB；否则就调用switch_mm()函数进行内存的切换（具体过程牵涉到内存管理模块的知识，这里就从略了）。实际上，在switch_mm()中还会对prev->active_mm和next->mm判断一次，如果两值相等，说明两个进程是同属于一个"进程"的两个"线程"（实际上是轻量进程），此时也不需要执行内存的切换，但这种情况TLB还是需要刷新的。
设置好next的内存环境以后，就可以调用mmdrop()释放掉prev的内存结构了。所有不在运行中的进程，其active_mm属性都应该为空。
这段解释，基本上说明了上面代码的作用。

附注：

在i386中：
#ifdef CONFIG_SMP

static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu)
{
if(cpu_tlbstate[cpu].state == TLBSTATE_OK)
cpu_tlbstate[cpu].state = TLBSTATE_LAZY;
}
#else
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu)
{
}
#endif

而在ARM中，有：
/*
* This is called when "tsk" is about to enter lazy TLB mode.
*
* mm: describes the currently active mm context
* tsk: task which is entering lazy tlb
* cpu: cpu number which is entering lazy tlb
*
* tsk->mm will be NULL
*/
static inline void
enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu)
{
}
TLB（TRANSLATION LOOK SIDE BUFFER）是MMU的一个存储“虚拟地址----物理地址对”的高速缓存。它一般用于存放最近访问的虚拟地址和与其对应的物理地址对。

[ 本帖最后由邓西村于 2006-8-3 17:50 编辑 ]

邓西村于 2006-08-03 11:46:51发表:

又到做作业的时候了，现在到了switch_tasks，先看看下面两句：
switch_tasks:
prefetch(next);
clear_tsk_need_resched(prev);
prefetch在干什么呢？以下有一段英文进行解释：
prefetch(x) attempts to pre-emptively get the memory pointed to by address "x" into the CPU L1
cache.
这是一个将next指针指向的地址读入缓存的操作。对理解schedule没有太大的影响。

clear_tsk_need_resched(prev)干什么呢？看代码：
static inline void clear_tsk_need_resched(task_t *tsk)
{
tsk->need_resched = 0;
}

在task_struct中need_resched有什么用？我在网上找了些解释。
（1）只需要设置当前进程（current）的need_resched，就有机会启动调度器。由于启动schedule()的时机实际上由当前进程决定，因此设置了need_resched并不意味着就能及时调度，这也是"Linux内核不可抢占"的原因。
（2）通常有如下几种场合会设置need_resched：
（a）update_process_times()，由时钟中断触发，负责管理除0号进程（idle进程）以外的其他各个进程的时间片消耗。如果当前进程（SCHED_FIFO实时进程除外）的时间片用完了（counter==0），则设置need_resched为1；--------如果是这样，在这里解释为什么要置零是合适的。
（b）reschedule_idle()，唤醒操作，但唤醒调度最经常的调用者是在某一事件等待队列上休眠的进程的唤醒过程--wake_up_process()及其他一系列wake_up函数
（c）sched_setscheduler()、sched_yield()系统调用，以及系统初始化（rest_init()中）、创建新进程（do_fork()中）等从语义上就希望启动调度器工作的场合。
后面（b）、（c）所使用的调用函数，在我这个版本上没有发现。但基本可以定调了，need_resched是在进程需要调度是设置的，在调度时将它清零。

接下来是真正的切换了：
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;

prepare_arch_switch(rq);

TRACE_SCHEDCHANGE(prev, next);

prev = context_switch(prev, next);
barrier();
rq = this_rq();
finish_arch_switch(rq);
} else
spin_unlock_irq(&rq->lock);
rq就是runqueue（运行队列），nr_switches加1，curr设置为next，然后prepare_arch_switch(rq)。看看
prepare_arch_switch(rq)的代码：
/*
* Default context-switch locking:
*/
#ifndef prepare_arch_schedule
# define prepare_arch_switch(rq) do { } while(0)
#endif
这是本文件前面找到的，发现什么也没做。倒是在asm-sparc64中有：
#define prepare_arch_switch(rq, next) \
do { spin_lock(&(next)->switch_lock); \
spin_unlock(&(rq)->lock); \
flushw_all(); \
} while (0)
可见并不是所有的CPU都需要做这个事情。

接下来是TRACE_SCHEDCHANGE(prev, next)，看代码：
/* TRACE_SCHEDCHANGE */
typedef struct _trace_schedchange
{
uint32_t out; /* Outgoing process */
uint32_t in; /* Incoming process */
uint32_t out_state; /* Outgoing process' state */
} LTT_PACKED_STRUCT trace_schedchange;
#define TRACE_SCHEDCHANGE(OUT, IN) \
do \
{\
trace_schedchange sched_event;\
sched_event.out = OUT->pid;\
sched_event.in = (uint32_t) IN;\
sched_event.out_state = OUT->state; \
trace_event(TRACE_EV_SCHEDCHANGE, &sched_event);\
} while(0)

这个是用于跟踪事件(trace_event)用的，此处是TRACE_EV_SCHEDCHANGE，但跟踪它有什么用呢？我还不知道呢。
接下来是prev = context_switch(prev, next);这是真正的切换。

秦靖于 2006-08-02 11:51:24发表:

很看不懂，向楼主学习

邓西村于 2006-07-31 14:47:33发表:

（续上节）
queue本身就是list_head类型指针。list_head定义如下：
struct list_head {
struct list_head *next, *prev;
};
在linux中，list_head是作为其他数据结构的一部分，嵌入到需要构成链表的单元中，就如同该数据结构的一个组件。list_entry宏就就从这个链表指针获得整个数据结构，如下图：
task_struct:
-----------------
| | \
| | &((task_t*)0)->run_list
|----------------| /
queue->next --->| run_list |
|----------------- |
|__________|

[ 本帖最后由邓西村于 2006-7-31 14:48 编辑 ]

邓西村于 2006-07-30 17:03:31发表:

又到做作业的时候了。^_^
schedule()的代码到了：
release_kernel_lock(prev, smp_processor_id());
prepare_arch_schedule(prev);
prev->sleep_timestamp = jiffies;
spin_lock_irq(&rq->lock);

先是release_kernel_lock，其代码如下：
/*
* Release global kernel lock and global interrupt lock
*/
#define release_kernel_lock(task, cpu) \
do { \
if (task->lock_depth >= 0) \
spin_unlock(&kernel_flag); \
release_irqlock(cpu); \
__sti(); \
} while (0)
打开了内核锁，而且允许中断。其实schedule()是系统调用，相当于软中断，但其毕竟优先级比硬件中断低。接下来执行prepare_arch_schedule()，其下是它的代码：# define prepare_arch_schedule(prev) do { } while(0)，好像是什么也没有做。？其解释如下：Default context-switch locking。大概是不用吧。接下来将runqueue的自旋锁锁上，看下面代码：
#define spin_lock_irq(lock) do { local_irq_disable(); spin_lock(lock); } while (0)
以下将操作运行队列runqueue，所以将其锁上锁，而且不运许其中断。对于I386有：
#define local_irq_disable() __cli()

最后说说prev->sleep_timestamp = jiffies;
将jiffies保存在即将换出的进程的sleep_timestamp 中。就jiffies，它是一个long形变量，系统开机后每10毫秒增加一，long形的最大值为为2^32。如此换算，系统运行497.1天之后，jiffies的值将会恢复为0。所以在调度过程中，jiffies是不会重复的，以此操作目的是记录换出时的时间。

看schedule()接下来是
/*
* if entering from preempt_schedule, off a kernel preemption,
* go straight to picking the next task.
*/
if (unlikely(preempt_get_count() & PREEMPT_ACTIVE))
goto pick_next_task;
如果是抢占式调度，则直接跳转至pick_next_task。这个迟点再看。然后是：
switch (prev->state) {
case TASK_INTERRUPTIBLE:
if (unlikely(signal_pending(prev))) {
prev->state = TASK_RUNNING;
break;
}
default:
deactivate_task(prev, rq);
case TASK_RUNNING:
;
}
其实这里的prev还是进入调度前的current（前面有prev = current;）。它的状态应该是什么呢？
是TASK_INTERRUPTIBLE，还是TASK_RUNNING，或是其他。看看deactivate_task：
/*
* deactivate_task - remove a task from the runqueue.
*/
static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
{
rq->nr_running--;
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
dequeue_task(p, p->array);
p->array = NULL;
}
这个函数是要将prev这个进程从运行队列中删除。
这段代码应该这样理解：
如果发现进程处于TASK_INTERRUPTIBLE状态且有信号等待处理，则内核将其状态设为TASK_RUNNING，让其处理完信号，接下来仍有机会获得CPU；如果没有信号等待，则将其从可运行队列中撤下来；如果处于TASK_RUNNING状态，则继续进行。否则，将该进程从运行队列runqueue中删除。

接下来是pick_next_task：
pick_next_task:
if (unlikely(!rq->nr_running)) {
#if CONFIG_SMP
load_balance(rq, 1);
if (rq->nr_running)
goto pick_next_task;
#endif
next = rq->idle;
rq->expired_timestamp = 0;
goto switch_tasks;
}

array = rq->active;
if (unlikely(!array->nr_active)) {
/*
* Switch the active and expired arrays.
*/
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
rq->expired_timestamp = 0;
}

idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);
switch_tasks:

我想先看看单CPU的情况，即：
if (unlikely(!rq->nr_running)) {
next = rq->idle;
rq->expired_timestamp = 0;
goto switch_tasks;
}
检查该CPU上的runqueue中是否还有进程处在TASK_RUNNING状态，如果没有则直接去执行切换。如果还有，则进行以下工作：
(1)将运行队列rq的active队列交给array，然后检查active状态任务的数量，如果为0时，说明所有的任务都已经将其运行时间片消耗完了。此时，需要将在耗尽队列中的队列交换给active队列，以下代码就是完成如下工作的，其实，在前面的帖子里也已经见过这段代码。
array = rq->active;
if (unlikely(!array->nr_active)) {
/*
* Switch the active and expired arrays.
*/
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
rq->expired_timestamp = 0;
}
(2)接下来的代码是：
idx = sched_find_first_bit(array->bitmap);
queue = array->queue + idx;
next = list_entry(queue->next, task_t, run_list);
这个sched_find_first_bit何用？
sched_find_first_bit,它用于快速定位就绪队列（runqueue）中高优先级任务，在水木清华站有个帖子，好像对解释这个问题有些用，简录如下：
runqueue中最重要的部分是prio_array_t *active, *expired, arrays[2]。每个CPU的就绪队列(runqueue)都是一个数组，按照时间片是否用完将就绪队列分为两个部分，分别用指针active和expired来指向数组的两个下标。这个数组就是arrays[2]。prio_array_t的结构如下：
#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
struct prio_array {
int nr_active; /*本进程组中进程个数*/
struct list_head queue[MAX_PRIO]; /*每个优先级的进程队列*/
unsigned long bitmap[BITMAP_SIZE]; /*上述进程队列的索引位图*/
};
数组queue[MAX_PRIO]里面存放的是优先级为i（MAX_PRIO>i>=0）的进程队列的链表头，即
task_struct::runlist（通过runnlist即可找到task_struct）。
那么调度器在执行调度的任务时是怎么找到优先级最高的进程呢？
在结构体struct prio_array中有一个重要的数据unsigned long bitmap[BITMAP_SIZE]，这个数据是用来作为进程队列queue[MAX_PRIO]的索引位图，bitmap的每一位（bit）都与queue[i]对应。当queue[i]的进程队列不为空时，bitmap的相应位就为1；否则就为0。这样我们只需要通过汇编指令从进程优先级由高到低的方向找到第一个为1的位置idx，即为当前就绪队列中最高的优先级（函数sched_find_first_bit()就是用来完成这一工作的），那么queue[i]->next就是我们要找的task_struct::runlist。

回到我们的代码，看看其轨迹：
array = rq->active; -> idx = sched_find_first_bit(array->bitmap); -> queue = array->queue + idx;
如此就找到了优先级较高的进程队列，然后取出要切换到的那个next进程，即：
next = list_entry(queue->next, task_t, run_list);
list_entry的原型如下：
/**
* list_entry - get the struct for this entry
* @ptr: the &struct list_head pointer.
* @type: the type of the struct this is embedded in.
* @member: the name of the list_struct within the struct.
*/
#define list_entry(ptr, type, member) \
((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
代入宏则变成：
((task_t*)((char *)queue->next - (unsigned long)(&((task_t *)0)->run_list)))
这种链表的方法还是很有借鉴意义的。

邓西村于 2006-07-30 12:13:25发表:

这里碰到了runqueue，先看看其定义：
/*
* This is the main, per-CPU runqueue data structure.
*
* Locking rule: those places that want to lock multiple runqueues
* (such as the load balancing or the process migration code), lock
* acquire operations must be ordered by ascending &runqueue.
*/
struct runqueue {
spinlock_t lock;
unsigned long nr_running, nr_switches, expired_timestamp,
nr_uninterruptible;
task_t *curr, *idle;
prio_array_t *active, *expired, arrays[2];
int prev_nr_running[NR_CPUS];
task_t *migration_thread;
struct list_head migration_queue;
} ____cacheline_aligned;

static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
#define cpu_rq(cpu) (runqueues + (cpu))
#define this_rq() cpu_rq(smp_processor_id())
其中有#define smp_processor_id() (current->cpu)，从这里可以看到schedule()->this_rq()，即得到当前调度进程的cpu的runqueue。runqueue有何用呢？先看看其成员：
（1）spinlock_t lock 自旋锁，应该是用于进程互斥之用。
（2）nr_running 处于TASK_RUNNING的进程数
（3）task_t *curr, *idle; /* 该CPU的当前进程和空闲进程*/
（4）prio_array_t *active, *expired, arrays[2]; /* 活跃和过期prio_array */
/* arrays[2]为他们实际占用的空间 */
对它们的解释如下：（摘录）
这里最关键的成员是两个prio_array_t, 以及两个此类型的指针。其中active指针指向活跃进程的优先级数组，而expired指向过期进程的优先级数组。当一个进程时间片用完并且没有立刻重新激活的特权时，只需重新计算时间片，并且将它移入expired优先级数组即可。在active数组为空时，只要将两个指针交换，由于expired数组中的时间片都已计算好，只要放到active的位置立刻可以执行，而空空如也的active数组则恰好充当新的expired数组。
这里说的是schedule（）后面部分的处理情况，如下：
if (unlikely(!array->nr_active)) {
/*
* Switch the active and expired arrays.
*/
rq->active = rq->expired;
rq->expired = array;
array = rq->active;
rq->expired_timestamp = 0;
}
这个等等再说。
有一段英文对处于TASK_RUNNING的进程描述揭示了runqueue的本质：
TASK_RUNNING
The process is runnable. It's either currently running or on a runqueue waiting to run. This is the only
possible state for a process executing in user-space. It can also apply to a process in kernel-space that is
actively running.
可见runqueue是用来对处于TASK_RUNNING状态的进程进行排队的一个机制。再看一段英文，当然也是摘录的：
All three OSes favor interactive threads/processes. Interactive threads run at better priority than compute-bound threads, but tend to run for shorter time slices. Solaris, FreeBSD, and Linux all use a per-CPU "runqueue." FreeBSD and Linux use an "active" queue and an "expired" queue. Threads are scheduled in priority from the active queue. A thread moves from the active queue to the expired queue when it uses up its time slice (and possibly at other times to avoid starvation). When the active queue is empty, the kernel swaps the active and expired queues. FreeBSD has a third queue for "idle" threads. Threads run on this queue only when the other two queues are empty. Solaris uses a "dispatch queue" per CPU. If a thread uses up its time slice, the kernel gives it a new priority and returns it to the dispatch queue. The "runqueues" for all three OSes have separate linked lists of runnable threads for different priorities. (Though FreeBSD uses one list per four priorities, both Solaris and Linux use a separate list for each priority.)
在runqueue中存在着两个队列，一个是active（活跃），另一个是expire（过期）。当线程/进程用完其时间片后将从active队列中进入expire队列，调度应该是调度active队列中的进程，expire队列中进程则处在等待的状况，当active队列空了，这时所有TASK_RUNNING状态的进程都应该在expire队列中，将expire队列中的进程再换入active，就开始了新的一轮调度。
在schedule()中调用rq = this_rq()，是要获得当前CPU的runqueue，以便进行调度。

邓西村于 2006-07-30 11:30:25发表:

在网上看到一篇关于实时LINUX的论文，其中一部分还是很有意思的，现摘录下来：
Hardhat Linux是MontaVista公司所发布的一款主要面向各种嵌入式应用的Linux发布[HardHatWeb][Morgan01]。Hard-hat Linux最大的贡献在于：为了解决Linux在内核态不可被抢占的问题，它开发了一种抢占式（Preemptible）的内核，有人认为它的这种方法充其量也就是一种能够被抢占（Preemptable）的内核。

其基本思想就是让调度程序获得更多的执行机会，从而减少了从一个事件发生到调度程序被执行的时间间隔。可抢占内核的补丁包修改了spinlock的宏定义以及中断返回处理代码，当当前进程可以被"安全"地抢占并且有一个等待处理的重新调度请求，系统就会调用调度程序进行进程调度。

那么什么情况下可以认为一个进程可以被"安全"地抢占？最早的Linux内核代码认为，一旦进入内核态执行，不管是由于陷入（trap）还是中断处理，当前执行进程都不会被切换，直到内核认为可以安全地进行重新调度为止。这种思想可以使得内核代码对一些数据结构进行操作时比较简单，即不需要使用互斥原语（比如旋转锁spinlock）进行数据的修改保护就可以安全地存取数据。但随着内核源代码的发展，不使用保护机制的内核数据访问代码越来越少，所以在抢占式内核中，认为如果内核不是在一个中断处理程序中，并且不再spinlock保护的代码中，就认为可以"安全"地进行进程切换。

抢占式内核对普通Linux内核作了如下的一些修改：

抢占式内核给task struct数据结构增加了一个数据项：preempt_count。该数据项由宏preempt_disable()、preempt_enable()、以及preempt_enable_no_resched()所使用。preempt_disable对preempt_count计数进行递增，preempt_enable对preempt_count进行递减。preempt_enable宏查看当前进程的preempt_count和need_resched域的内容，如果 preempt_count为0并且need_resched为1，则调用preempt_schedule()函数。该函数将给当前进程的preempt_count项增加一个很大的值（比如让preempt_counter=preempt_counter + 0x4000000），然后调用进程调度函数schedule()，在schedule函数返回后从该进程preempt_count中再减去该值。可抢占内核也修改了schedule函数，它检测进程的preempt_counter是否很大（这是为了屏蔽一些普通调度流程中对于抢占式调度来说是冗余的那些操作），然后执行抢占式调度。
抢占式内核补丁也修改了spinlock的代码。在spin_lock()和spin_try_lock中增加了对于preempt_disable的调用，在spin_unlock()中增加了对于preempt_enable的调用。
另外抢占式内核补丁还修改了中断返回的代码，在其中增加了对于preempt_enable的调用。
所以我们根据上面的三个修改可以看出，内核的抢占式调度发生在如下情况：在释放spinlock时，或者当中断返回时，如果当前执行进程的need_resched被标记，则进行抢占式调度。

摘自孙焕强，《基于LINUX的实时系统》

邓西村于 2006-07-30 10:56:37发表:

为什么要先增加preempt_count(代码++current->preempt_count)呢？看看preempt_lock_start：
#define preempt_lock_start(c) \
do { \
if ((preempt_get_count() & ~PREEMPT_ACTIVE) == 1) \
latency_start(__BASE_FILE__, __LINE__, c); \
} while (0)
解释一下：如果抢占计数preempt_count为1，而且PREEMPT_ACTIVE为0则，执行latency_start。然而latency_start这段我还没看明白有什么用，怎么用。代码如下：
asmlinkage void latency_start(const char *fname,unsigned lineno, int cause)
{
struct LatencyData *l = &latencyData[smp_processor_id()];

/* if we are not logging or we have an error, do nothing */
if ((logFlag == 0) || (panicFlag != 0))
return;

if (l->syncFlag != SfC) {
readclock(l->latencyStartCount);
l->startFlag = 1;
l->mask = 0;
l->theCause = cause;
l->syncFlag = SfC;
l->latencyStart.Address = (int)__builtin_return_address(0);
l->latencyStart.FileName = fname;
l->latencyStart.FileLine = lineno;
l->latencyStart.ProcId = current->pid;
memcpy(l->latencyStart.ProcName, current->comm, 16);
CLEAR(intrCount);
}
}

字面上是“延迟开始”，在百度上搜索无所得，但有对preempt_count的描述，即2.4内核为细化多CPU下内核线程同步机构，在task_struct中加入了一个内核抢占锁preempt_count。只有当它为0时表示可以进行内核调度。
其实，preempt_count不应该叫作锁，因为它仅仅是个int类型的数据。
回到schedule():
need_resched:
preempt_disable();
prev = current;
rq = this_rq();
preempt_disable()即让抢占失效，然后将current赋给prev，rq被赋予this_rq()，完成交换时的替换(A->B,C->A,B->C)

依依杨柳于 2006-07-30 09:28:16发表:

厉害
还自嘲是小鸟！
很牛！

厉烨于 2006-07-29 20:43:00发表:

牛，支持

fengmayi1 于 2006-07-29 12:43:38发表:

头都看大了，向楼主学习

邓西村于 2006-07-29 12:33:05发表:

我将从MVL的版本开始。
if (unlikely(in_interrupt()))
BUG();
这个in_interrupt()是用于判断是否处于中断上下文，不同的CPU就有不同的判断方法，但其方法前的注释还是很能说明问题的，以下是摘自include/asm-arm/hardirq.h文件中
/*
* Are we in an interrupt context? Either doing bottom half
* or hardware interrupt processing?
*/
#define in_interrupt() ({ const int __cpu = smp_processor_id(); \
(local_irq_count(__cpu) + local_bh_count(__cpu) != 0); })
它的意思我不想去深究，我想还是先回到schedule()。unlike()对这个结果进行判断，unlike()是什么意思呢？看下面代码：在include/linux/compiler.h中有
#define unlikely(x) __builtin_expect((x),0)
还有一段解释：
/* Somewhere in the middle of the GCC 2.96 development cycle, we implemented
a mechanism by which the user can annotate likely branch directions and
expect the blocks to be reordered appropriately. Define __builtin_expect
to nothing for earlier compilers. */
#if __GNUC__ == 2 && __GNUC_MINOR__ < 96
#define __builtin_expect(x, expected_value) (x)
#endif
也就是说使用 __builtin_expect来处理先前的编译器。完成这一段宏之后，将if(__cpu)，如果发生了中断，则__cpu应该是1，即将执行BUG()，即出错了。从这里看，在进行schudule调度时，是应该不在中断中的，也即在调用中断服务子程序时，是不能进行调度的。其实，这也是合理的，因为调度的优先级应该比中断处理的低一些。
接下来，应该是进行调度的准备了。
need_resched:
preempt_disable();
prev = current;
rq = this_rq();
preempt_disable()在做什么呢？在include/linux/Spinlock.h中可以找到它的定义
#define preempt_disable() \
do { \
++current->preempt_count; \
barrier(); \
preempt_lock_start(1); \
} while (0)

先看barrier(),在include/linux/kernel.h可以找到#define barrier() __asm__ __volatile__("": : :"memory")，对于它的意义，可以看http://linux.chinaunix.net/jh/4/683018.html，大致意思是使用memory强制CPU将registers和cache中缓存数据作废，这是GCC的一个优化。

redapp 于 2006-07-29 11:46:48发表:

好深奥，向楼主致敬

小鸟学飞

共有 18 条评论

频道文章

最新教程

随机推荐