查看: 7584|回复: 0
打印 上一主题 下一主题

Linux内核抢占补丁的基本原理

[复制链接]
跳转到指定楼层
1#
发表于 2007-9-8 22:33:26 | 只看该作者 回帖奖励 |倒序浏览 |阅读模式
台州网址导航
CPU在内核中运行时并不是处处不可抢占的,内核中存在一些空隙,在这时进行抢占是安
全的,内核抢占补丁的基本原理就是将SMP可并行的代码段看成是可以进行内核抢占的区
域。
2.4内核正好细化了多CPU下的内核线程同步机构,对不可并行的指令块用spinlock和rwlock作了细致的表示,该补丁的实现可谓水到渠成。
具体的方法就是在进程的任务结构上增加一个preempt_count变量作为内核抢占锁,它随
着spinlock和rwlock一起加锁和解锁。当preempt_count为0时表示可以进行内核调度。
内核调度器的入口为preempt_schedule(),它将当前进程标记为TASK_PREEMPTED状态再
调用schedule(),在TASK_PREEMPTED状态,schedule()不会将进程从运行队列中删除。

下面是内核抢占补丁的主要代码示意:
arch/i386/kernel/entry.S:
preempt_count =  4 # reuse the task_struct 'flags' slot as preempt_count (flags was moved elsewhere)
ret_from_exception: # return path from an exception
#ifdef CONFIG_SMP
GET_CURRENT(%ebx)
movl processor(%ebx),%eax
shll $CONFIG_X86_L1_CACHE_SHIFT,%eax
movl SYMBOL_NAME(irq_stat)(,%eax),%ecx  # softirq_active
testl SYMBOL_NAME(irq_stat)+4(,%eax),%ecx # softirq_mask
#else
movl SYMBOL_NAME(irq_stat),%ecx  # softirq_active
testl SYMBOL_NAME(irq_stat)+4,%ecx # softirq_mask
#endif
jne   handle_softirq
#ifdef CONFIG_PREEMPT
cli
incl preempt_count(%ebx) # exception entry does not disable preemption; bump here to pair with the decl in ret_from_intr below
#endif
ENTRY(ret_from_intr) # return path from a hardware interrupt
GET_CURRENT(%ebx)
#ifdef CONFIG_PREEMPT
cli
decl preempt_count(%ebx) # drop the preemption lock taken at interrupt entry
#endif
movl EFLAGS(%esp),%eax  # mix EFLAGS and CS
movb CS(%esp),%al
testl $(VM_MASK | 3),%eax # return to VM86 mode or non-supervisor?
jne ret_with_reschedule
#ifdef CONFIG_PREEMPT
cmpl $0,preempt_count(%ebx)
jnz restore_all   # non-zero preempt_count: kernel preemption is disabled
cmpl $0,need_resched(%ebx)
jz restore_all   # no reschedule pending; just return
movl SYMBOL_NAME(irq_stat)+irq_stat_local_bh_count CPU_INDX,%ecx
addl SYMBOL_NAME(irq_stat)+irq_stat_local_irq_count CPU_INDX,%ecx
jnz restore_all   # still inside irq/bh processing: do not preempt
incl preempt_count(%ebx)
sti
call SYMBOL_NAME(preempt_schedule)
jmp ret_from_intr   # when the preempted task resumes, loop back through ret_from_intr so the preempt flag is restored before returning
#else
jmp restore_all
#endif
ALIGN
handle_softirq:
#ifdef CONFIG_PREEMPT
cli
GET_CURRENT(%ebx)
incl preempt_count(%ebx) # run softirqs with kernel preemption disabled
sti
#endif
call SYMBOL_NAME(do_softirq)
jmp ret_from_intr
ALIGN
reschedule:
call SYMBOL_NAME(schedule)    # test
jmp ret_from_sys_call
include/asm/hw_irq.h:
...
#ifdef CONFIG_PREEMPT
/* Bump the current task's preemption lock; preempt_count sits at
 * offset 4 in the task_struct (see "preempt_count = 4" in entry.S). */
#define BUMP_CONTEX_SWITCH_LOCK \
GET_CURRENT \
"incl 4(%ebx)\n\t"
#else
#define BUMP_CONTEX_SWITCH_LOCK
#endif
/* Hardware-interrupt entry: save the interrupted context's registers,
 * load the kernel data segments, and disable kernel preemption. */
#define SAVE_ALL \
"cld\n\t" \
"pushl %es\n\t" \
"pushl %ds\n\t" \
"pushl %eax\n\t" \
"pushl %ebp\n\t" \
"pushl %edi\n\t" \
"pushl %esi\n\t" \
"pushl %edx\n\t" \
"pushl %ecx\n\t" \
"pushl %ebx\n\t" \
"movl $" STR(__KERNEL_DS) ",%edx\n\t" \
"movl %edx,%ds\n\t" \
"movl %edx,%es\n\t" \
BUMP_CONTEX_SWITCH_LOCK /* interrupt entry disables kernel preemption */
include/linux/spinlock.h:
#ifdef CONFIG_PREEMPT
/* The per-task kernel preemption lock lives in current->preempt_count. */
#define switch_lock_count() current->preempt_count
/* Non-zero while kernel preemption is disabled for the current task. */
#define in_ctx_sw_off() (switch_lock_count().counter)
#define atomic_ptr_in_ctx_sw_off() (&switch_lock_count())
/* Disable kernel preemption: increment the current task's preempt count. */
#define ctx_sw_off() \
do { \
atomic_inc(atomic_ptr_in_ctx_sw_off());  \
} while (0)
/* Re-enable kernel preemption without triggering a reschedule. */
#define ctx_sw_on_no_preempt() \
do { \
atomic_dec(atomic_ptr_in_ctx_sw_off()); \
} while (0)
/* Re-enable kernel preemption; if the count drops to zero and a
 * reschedule is pending, preempt the current task right away. */
#define ctx_sw_on() \
do { \
if (atomic_dec_and_test(atomic_ptr_in_ctx_sw_off()) && \
     current->need_resched) \
  preempt_schedule();   \
} while (0)
/* Taking a spinlock disables preemption for as long as it is held. */
#define spin_lock(lock) \
do { \
ctx_sw_off(); \
_raw_spin_lock(lock); \
} while(0)
/* Try to take the lock; on failure re-enable preemption and yield 0. */
#define spin_trylock(lock) ({ctx_sw_off(); _raw_spin_trylock(lock) ? \
     1 : ({ctx_sw_on(); 0;});})
/* Dropping the lock re-enables preemption and may reschedule at once. */
#define spin_unlock(lock) \
do { \
_raw_spin_unlock(lock); \
ctx_sw_on(); \
} while (0)
#define read_lock(lock)  ({ctx_sw_off(); _raw_read_lock(lock);})
#define read_unlock(lock) ({_raw_read_unlock(lock); ctx_sw_on();})
#define write_lock(lock) ({ctx_sw_off(); _raw_write_lock(lock);})
#define write_unlock(lock) ({_raw_write_unlock(lock); ctx_sw_on();})
#define write_trylock(lock) ({ctx_sw_off(); _raw_write_trylock(lock) ? \
     1 : ({ctx_sw_on(); 0;});})
...
include/asm/softirq.h:
/* Disabling bottom halves also disables kernel preemption (ctx_sw_off);
 * the bh count is one of the values ret_from_intr checks before calling
 * preempt_schedule().  Original text was hard-wrapped mid-token
 * ("barrie/r"), which broke both macros — rejoined here. */
#define cpu_bh_disable(cpu) do { ctx_sw_off(); local_bh_count(cpu)++; barrier(); } while (0)
#define cpu_bh_enable(cpu)  do { barrier(); local_bh_count(cpu)--; ctx_sw_on(); } while (0)
kernel/schedule.c:
#ifdef CONFIG_PREEMPT
/*
 * Kernel-preemption entry point (called from ret_from_intr and from
 * ctx_sw_on()).  Marks the current task TASK_PREEMPTED so that
 * schedule() keeps it on the run queue, then reschedules.  Loops for
 * as long as need_resched remains set after the switch back.
 */
asmlinkage void preempt_schedule(void)
{
while (current->need_resched) {
  ctx_sw_off();                       /* hold the preempt lock across the switch */
  current->state |= TASK_PREEMPTED;   /* tell schedule() this is a preemption */
  schedule();
  current->state &= ~TASK_PREEMPTED;
  ctx_sw_on_no_preempt();             /* drop the count without recursing into preemption */
}
}
#endif
/*
 * The core scheduler: picks the runnable task with the highest
 * goodness() and context-switches to it.  With CONFIG_PREEMPT the whole
 * function runs with kernel preemption disabled (ctx_sw_off() on entry,
 * ctx_sw_on_no_preempt() before returning).
 */
asmlinkage void schedule(void)
{
struct schedule_data * sched_data;
struct task_struct *prev, *next, *p;
struct list_head *tmp;
int this_cpu, c;
#ifdef CONFIG_PREEMPT
ctx_sw_off(); /* no nested preemption while we choose the next task */
#endif
if (!current->active_mm) BUG();
need_resched_back:
prev = current;
this_cpu = prev->processor;
if (in_interrupt())
  goto scheduling_in_interrupt;
release_kernel_lock(prev, this_cpu);
/* Do "administrative" work here while we don't hold any locks */
if (softirq_active(this_cpu) & softirq_mask(this_cpu))
  goto handle_softirq;
handle_softirq_back:
/*
  * 'sched_data' is protected by the fact that we can run
  * only one process per CPU.
  */
sched_data = & aligned_data[this_cpu].schedule_data;
spin_lock_irq(&runqueue_lock);
/* move an exhausted RR process to be last.. */
if (prev->policy == SCHED_RR)
  goto move_rr_last;
move_rr_back:
switch (prev->state) {
  case TASK_INTERRUPTIBLE:
   if (signal_pending(prev)) {
    prev->state = TASK_RUNNING;
    break;
   }
  default:
#ifdef CONFIG_PREEMPT
   if (prev->state & TASK_PREEMPTED)
    break; /* preemptive reschedule: keep the task on the run queue */
#endif
   del_from_runqueue(prev);
#ifdef CONFIG_PREEMPT
  case TASK_PREEMPTED:
#endif
  case TASK_RUNNING:
}
prev->need_resched = 0;
/*
  * this is the scheduler proper:
  */
repeat_schedule:
/*
  * Default process to select..
  */
next = idle_task(this_cpu);
c = -1000;
if (task_on_runqueue(prev))
  goto still_running;
still_running_back:
list_for_each(tmp, &runqueue_head) {
  p = list_entry(tmp, struct task_struct, run_list);
  if (can_schedule(p, this_cpu)) {
   int weight = goodness(p, this_cpu, prev->active_mm);
   if (weight > c)
    c = weight, next = p;
  }
}
/* Do we need to re-calculate counters? */
if (!c)
  goto recalculate;
/*
  * from this point on nothing can prevent us from
  * switching to the next task, save this fact in
  * sched_data.
  */
sched_data->curr = next;
#ifdef CONFIG_SMP
  next->has_cpu = 1;
next->processor = this_cpu;
#endif
spin_unlock_irq(&runqueue_lock);
if (prev == next)
  goto same_process;
#ifdef CONFIG_SMP
  /*
   * maintain the per-process 'last schedule' value.
   * (this has to be recalculated even if we reschedule to
   * the same process) Currently this is only used on SMP,
  * and it's approximate, so we do not have to maintain
  * it while holding the runqueue spinlock.
   */
  sched_data->last_schedule = get_cycles();
/*
  * We drop the scheduler lock early (it's a global spinlock),
  * thus we have to lock the previous process from getting
  * rescheduled during switch_to().
  */
#endif /* CONFIG_SMP */
kstat.context_swtch++;
/*
  * there are 3 processes which are affected by a context switch:
  *
  * prev == .... ==> (last => next)
  *
  * It's the 'much more previous' 'prev' that is on next's stack,
  * but prev is set to (the just run) 'last' process by switch_to().
  * This might sound slightly confusing but makes tons of sense.
  */
prepare_to_switch();
{
  struct mm_struct *mm = next->mm;
  struct mm_struct *oldmm = prev->active_mm;
  if (!mm) {
   if (next->active_mm) BUG();
   next->active_mm = oldmm;
   atomic_inc(&oldmm->mm_count);
   enter_lazy_tlb(oldmm, next, this_cpu);
  } else {
   if (next->active_mm != mm) BUG();
   switch_mm(oldmm, mm, next, this_cpu);
  }
  if (!prev->mm) {
   prev->active_mm = NULL;
   mmdrop(oldmm);
  }
}
/*
  * This just switches the register state and the
  * stack.
  */
switch_to(prev, next, prev);
__schedule_tail(prev);
same_process:
reacquire_kernel_lock(current);
if (current->need_resched)
  goto need_resched_back;
#ifdef CONFIG_PREEMPT
ctx_sw_on_no_preempt(); /* leave without re-entering preemption */
#endif
return;
recalculate:
{
  struct task_struct *p;
  spin_unlock_irq(&runqueue_lock);
  read_lock(&tasklist_lock);
  for_each_task(p)
   p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
  read_unlock(&tasklist_lock);
  spin_lock_irq(&runqueue_lock);
}
goto repeat_schedule;
still_running:
c = goodness(prev, this_cpu, prev->active_mm);
next = prev;
goto still_running_back;
handle_softirq:
do_softirq();
goto handle_softirq_back;
move_rr_last:
if (!prev->counter) {
  prev->counter = NICE_TO_TICKS(prev->nice);
  move_last_runqueue(prev);
}
goto move_rr_back;
scheduling_in_interrupt:
printk("Scheduling in interrupt\n");
BUG();
return;
}
/*
 * Finish a context switch on behalf of 'prev' (runs on the new task's
 * stack), then re-enable kernel preemption.  Uses ctx_sw_on(), so a
 * pending need_resched may preempt immediately here.
 * NOTE(review): presumably the entry path for freshly forked tasks —
 * confirm against the fork/entry code, which is not in this excerpt.
 */
void schedule_tail(struct task_struct *prev)
{
__schedule_tail(prev);
#ifdef CONFIG_PREEMPT
ctx_sw_on();
#endif
}
分享到:  QQ好友和群QQ好友和群 QQ空间QQ空间 腾讯微博腾讯微博 腾讯朋友腾讯朋友
收藏收藏 转播转播 分享分享 分享淘帖
台州维博网络(www.tzweb.com)专门运用PHP+MYSQL/ASP.NET+MSSQL技术开发网站门户平台系统等。
您需要登录后才可以回帖 登录 | 注册

本版积分规则

网站推广
关于我们
  • 台州朗动科技(Tzweb.com)拥有多年开发网站平台系统门户手机客户端等业务的成功经验。主要从事:政企网站,系统平台,微信公众号,各类小程序,手机APP客户端,浙里办微应用,浙政钉微应用、主机域名、虚拟空间、后期维护等服务,满足不同企业公司的需求,是台州地区领先的网络技术服务商!

Hi,扫描关注我

Copyright © 2005-2026 站长论坛 All rights reserved

Powered by 站长论坛 with TZWEB Update Technology Support

快速回复 返回顶部 返回列表