Linux内核中断总结
中断嵌套
早期的内核是可以中断嵌套的,现在不可以。
早期的Linux内核中断分为两种情况:
带IRQF_DISABLED
的快中断,这种中断在执行的时候是不允许新的中断触发,也就是不允许嵌套。
不带IRQF_DISABLED
的慢中断,这种中断在执行中断处理程序的时候是开中断执行的,是允许更高优先级的中断打断自己,嵌套执行的。
在老版本内核中指定一个中断服务程序不想被打断的方法,就是request_irq
的时候添加IRQF_DISABLED
标志,如:
request_irq(HD_IRQ, hd_interrupt, IRQF_DISABLED, "hd", NULL)
但这个能力在2010年被提交e58aa3d2d0cc01ad8d6f7f640a0670433f794922
废除了:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e58aa3d2d0cc
这意味着从这个commit开始,无论设不设置IRQF_DISABLED
,内核都不会在中断服务的时候开启CPU对中断的响应。
在2.6.32(2009/12/03)版本内核中还有这段逻辑。
kernel/irq/handle.c
363 /**
364 * handle_IRQ_event - irq action chain handler
365 * @irq: the interrupt number
366 * @action: the interrupt action chain for this irq
367 *
368 * Handles the action chain of an irq event
369 */
370 irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
371 {
372 irqreturn_t ret, retval = IRQ_NONE;
373 unsigned int status = 0;
374
375 if (!(action->flags & IRQF_DISABLED))
376 local_irq_enable_in_hardirq();
377
378 do {
379 trace_irq_handler_entry(irq, action);
380 ret = action->handler(irq, action->dev_id);
381 trace_irq_handler_exit(irq, action, ret);
382
383 switch (ret) {
384 case IRQ_WAKE_THREAD:
385 /*
386 * Set result to handled so the spurious check
387 * does not trigger.
388 */
389 ret = IRQ_HANDLED;
390
391 /*
392 * Catch drivers which return WAKE_THREAD but
393 * did not set up a thread function
394 */
395 if (unlikely(!action->thread_fn)) {
396 warn_no_thread(irq, action);
397 break;
398 }
399
400 /*
401 * Wake up the handler thread for this
402 * action. In case the thread crashed and was
403 * killed we just pretend that we handled the
404 * interrupt. The hardirq handler above has
405 * disabled the device interrupt, so no irq
406 * storm is lurking.
407 */
408 if (likely(!test_bit(IRQTF_DIED,
409 &action->thread_flags))) {
410 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
411 wake_up_process(action->thread);
412 }
413
414 /* Fall through to add to randomness */
415 case IRQ_HANDLED:
416 status |= action->flags;
417 break;
418
419 default:
420 break;
421 }
422
423 retval |= ret;
424 action = action->next;
425 } while (action);
426
427 if (status & IRQF_SAMPLE_RANDOM)
428 add_interrupt_randomness(irq);
429 local_irq_disable();
430
431 return retval;
432 }
关于local_irq_enable_in_hardirq
的作用就是重新开启CPU对中断的响应(在x86上最终执行的是sti指令)。以2.6.32为例,其定义如下:
include/linux/interrupt.h
182 # define local_irq_enable_in_hardirq() local_irq_enable()
include/linux/irqflags.h
59 #define local_irq_enable() \
60 do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0)
arch/x86/include/asm/irqflags.h
42 static inline void native_irq_enable(void)
43 {
44 asm volatile("sti": : :"memory");
45 }
79 static inline void raw_local_irq_enable(void)
80 {
81 native_irq_enable();
82 }
在2.6.36(2010/10/20)版本的内核中就没有这一段逻辑了。
361 /**
362 * handle_IRQ_event - irq action chain handler
363 * @irq: the interrupt number
364 * @action: the interrupt action chain for this irq
365 *
366 * Handles the action chain of an irq event
367 */
368 irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
369 {
370 irqreturn_t ret, retval = IRQ_NONE;
371 unsigned int status = 0;
372
373 do {
374 trace_irq_handler_entry(irq, action);
375 ret = action->handler(irq, action->dev_id);
376 trace_irq_handler_exit(irq, action, ret);
377
378 switch (ret) {
379 case IRQ_WAKE_THREAD:
380 /*
381 * Set result to handled so the spurious check
382 * does not trigger.
383 */
384 ret = IRQ_HANDLED;
385
386 /*
387 * Catch drivers which return WAKE_THREAD but
388 * did not set up a thread function
389 */
390 if (unlikely(!action->thread_fn)) {
391 warn_no_thread(irq, action);
392 break;
393 }
394
395 /*
396 * Wake up the handler thread for this
397 * action. In case the thread crashed and was
398 * killed we just pretend that we handled the
399 * interrupt. The hardirq handler above has
400 * disabled the device interrupt, so no irq
401 * storm is lurking.
402 */
403 if (likely(!test_bit(IRQTF_DIED,
404 &action->thread_flags))) {
405 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
406 wake_up_process(action->thread);
407 }
408
409 /* Fall through to add to randomness */
410 case IRQ_HANDLED:
411 status |= action->flags;
412 break;
413
414 default:
415 break;
416 }
417
418 retval |= ret;
419 action = action->next;
420 } while (action);
421
422 if (status & IRQF_SAMPLE_RANDOM)
423 add_interrupt_randomness(irq);
424 local_irq_disable();
425
426 return retval;
427 }
时钟中断对时间片用尽的处理方式
参考的是2.4.0版本的内核
arch/i386/kernel/time.c
454 /*
455 * This is the same as the above, except we _also_ save the current
456 * Time Stamp Counter value at the time of the timer interrupt, so that
457 * we later on can estimate the time of day more exactly.
458 */
459 static void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
460 {
......
501 do_timer_interrupt(irq, NULL, regs);
......
505 }
380 /*
381 * timer_interrupt() needs to keep up the real-time clock,
382 * as well as call the "do_timer()" routine every clocktick
383 */
384 static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
385 {
......
406 do_timer(regs);
......
450 }
kernel/timer.c
674 void do_timer(struct pt_regs *regs)
675 {
676 (*(unsigned long *)&jiffies)++;
677 #ifndef CONFIG_SMP
678 /* SMP process accounting uses the local APIC timer */
679
680 update_process_times(user_mode(regs));
681 #endif
682 mark_bh(TIMER_BH);
683 if (TQ_ACTIVE(tq_timer))
684 mark_bh(TQUEUE_BH);
685 }
575 /*
576 * Called from the timer interrupt handler to charge one tick to the current
577 * process. user_tick is 1 if the tick is user time, 0 for system.
578 */
579 void update_process_times(int user_tick)
580 {
581 struct task_struct *p = current;
582 int cpu = smp_processor_id(), system = user_tick ^ 1;
583
584 update_one_process(p, user_tick, system, cpu);
585 if (p->pid) {
586 if (--p->counter <= 0) {
587 p->counter = 0;
588 p->need_resched = 1;
589 }
590 if (p->nice > 0)
591 kstat.per_cpu_nice[cpu] += user_tick;
592 else
593 kstat.per_cpu_user[cpu] += user_tick;
594 kstat.per_cpu_system[cpu] += system;
595 } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
596 kstat.per_cpu_system[cpu] += system;
597 }
在时钟中断处理函数timer_interrupt
中调用do_timer_interrupt
,再调用到do_timer
,最终(在未定义CONFIG_SMP的情况下)调用到update_process_times
,在该函数中会将当前进程的时间片减1,如果发现当前进程时间片用尽,则将其need_resched
标记置1。need_resched
定义在task_struct
里,可以看到其位置在偏移20
字节处(32位平台上,它前面的state、flags、sigpending、addr_limit、exec_domain各占4字节,其中mm_segment_t为4字节)。
include/asm-i386/processor.h
323 typedef struct {
324 unsigned long seg;
325 } mm_segment_t;
include/linux/sched.h
277 struct task_struct {
278 /*
279 * offsets of these are hardcoded elsewhere - touch with care
280 */
281 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
282 unsigned long flags; /* per process flags, defined below */
283 int sigpending;
284 mm_segment_t addr_limit; /* thread address space:
285 0-0xBFFFFFFF for user-thead
286 0-0xFFFFFFFF for kernel-thread
287 */
288 struct exec_domain *exec_domain;
289 volatile long need_resched;
290 unsigned long ptrace;
291
292 int lock_depth; /* Lock depth */
293
294 /*
295 * offset 32 begins here on 32-bit platforms. We keep
296 * all fields in a single cacheline that are needed for
297 * the goodness() loop in schedule().
298 */
299 long counter;
300 long nice;
301 unsigned long policy;
在从内核返回到用户空间前,会判断need_resched
这个值,如果被置1,则会在ret_with_reschedule
处跳转到reschedule
,最终会调用到schedule
。
71 /*
72 * these are offsets into the task-struct.
73 */
74 state = 0
75 flags = 4
76 sigpending = 8
77 addr_limit = 12
78 exec_domain = 16
79 need_resched = 20
80 tsk_ptrace = 24
81 processor = 52
188 /*
189 * Return to user mode is not as complex as all this looks,
190 * but we want the default path for a system call return to
191 * go as quickly as possible which is why some of this is
192 * less clear than it otherwise should be.
193 */
194
195 ENTRY(system_call)
196 pushl %eax # save orig_eax
197 SAVE_ALL
198 GET_CURRENT(%ebx)
199 cmpl $(NR_syscalls),%eax
200 jae badsys
201 testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS
202 jne tracesys
203 call *SYMBOL_NAME(sys_call_table)(,%eax,4)
204 movl %eax,EAX(%esp) # save the return value
205 ENTRY(ret_from_sys_call)
206 #ifdef CONFIG_SMP
207 movl processor(%ebx),%eax
208 shll $CONFIG_X86_L1_CACHE_SHIFT,%eax
209 movl SYMBOL_NAME(irq_stat)(,%eax),%ecx # softirq_active
210 testl SYMBOL_NAME(irq_stat)+4(,%eax),%ecx # softirq_mask
211 #else
212 movl SYMBOL_NAME(irq_stat),%ecx # softirq_active
213 testl SYMBOL_NAME(irq_stat)+4,%ecx # softirq_mask
214 #endif
215 jne handle_softirq
216
217 ret_with_reschedule:
218 cmpl $0,need_resched(%ebx)
219 jne reschedule
220 cmpl $0,sigpending(%ebx)
221 jne signal_return
222 restore_all:
223 RESTORE_ALL
287 reschedule:
288 call SYMBOL_NAME(schedule) # test
289 jmp ret_from_sys_call
need_resched
就是在schedule
里被清零的。而counter
也会在这里被重新填值。
498 /*
499 * 'schedule()' is the scheduler function. It's a very simple and nice
500 * scheduler: it's not perfect, but certainly works for most things.
501 *
502 * The goto is "interesting".
503 *
504 * NOTE!! Task 0 is the 'idle' task, which gets called when no other
505 * tasks can run. It can not be killed, and it cannot sleep. The 'state'
506 * information in task[0] is never used.
507 */
508 asmlinkage void schedule(void)
509 {
510 struct schedule_data * sched_data;
511 struct task_struct *prev, *next, *p;
512 struct list_head *tmp;
......
517 prev = current;
......
542
543 switch (prev->state) {
544 case TASK_INTERRUPTIBLE:
545 if (signal_pending(prev)) {
546 prev->state = TASK_RUNNING;
547 break;
548 }
549 default:
550 del_from_runqueue(prev);
551 case TASK_RUNNING:
552 }
553 prev->need_resched = 0;
......
656 return;
657
658 recalculate:
659 {
660 struct task_struct *p;
661 spin_unlock_irq(&runqueue_lock);
662 read_lock(&tasklist_lock);
663 for_each_task(p)
664 p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
665 read_unlock(&tasklist_lock);
666 spin_lock_irq(&runqueue_lock);
667 }
668 goto repeat_schedule;
669
670 still_running:
671 c = goodness(prev, this_cpu, prev->active_mm);
672 next = prev;
673 goto still_running_back;
674
675 handle_softirq:
676 do_softirq();
677 goto handle_softirq_back;
678
679 move_rr_last:
680 if (!prev->counter) {
681 prev->counter = NICE_TO_TICKS(prev->nice);
682 move_last_runqueue(prev);
683 }
684 goto move_rr_back;
685
686 scheduling_in_interrupt:
687 printk("Scheduling in interrupt\n");
688 BUG();
689 return;
690 }
这里还有一点需要特别解释一下,就是为什么在update_process_times
里586
行处对时间片减1后,判断是否用尽的判断方法是if (--p->counter <= 0)
而不是if (--p->counter == 0)
?
因为可能出现如下的执行序列:前一个时钟中断在将counter
减到0后,开始中断底半部处理工作,其处理耗时可能会比较长,甚至长过一个时钟中断周期,而底半部处理又是在开中断的情况下执行的,因此有可能前一个时钟中断还没执行到重新调度逻辑(在这个逻辑里会重置need_resched
和counter
),又一次时钟中断就发生了,而这次时钟中断必然会将counter
从0
减到负数。此时如果用== 0来判断,条件就不会成立;而用<= 0判断,则仍然会把counter重新置为0并将need_resched置1。