标签:
本文主要介绍内核抢占的相关概念和具体实现,以及抢占对内核调度、内核竞态与同步的一些影响。
抢占计数8位, PREEMPT_MASK => 0x000000ff
软中断计数8位, SOFTIRQ_MASK => 0x0000ff00
硬中断计数4位, HARDIRQ_MASK => 0x000f0000
不可屏蔽中断1位, NMI_MASK => 0x00100000
抢占活动标识1位, PREEMPT_ACTIVE => 0x00200000
调度标识1位, PREEMPT_NEED_RESCHED => 0x80000000
__preempt_count的作用
thread_info的flags
__preempt_count的相关操作
/////// need_resched flag helpers ///////
// The PREEMPT_NEED_RESCHED bit is stored inverted: when the bit is 0, a reschedule is needed
#define PREEMPT_NEED_RESCHED 0x80000000
/*
 * Record that a reschedule is needed.
 *
 * The flag lives inverted in the top bit of __preempt_count, so
 * "need resched" is recorded by clearing that bit and leaving all
 * other bits untouched.
 */
static __always_inline void set_preempt_need_resched(void)
{
	const unsigned int keep_mask = ~PREEMPT_NEED_RESCHED;

	raw_cpu_and_4(__preempt_count, keep_mask);
}
/*
 * Drop the "reschedule needed" indication.
 *
 * Because the flag is stored inverted, clearing the request means
 * setting the top bit of __preempt_count.
 */
static __always_inline void clear_preempt_need_resched(void)
{
	const unsigned int flag_bit = PREEMPT_NEED_RESCHED;

	raw_cpu_or_4(__preempt_count, flag_bit);
}
/*
 * Report whether a reschedule has been requested.
 *
 * Returns true when the PREEMPT_NEED_RESCHED bit of __preempt_count
 * is clear (the flag is stored inverted), false when it is set.
 */
static __always_inline bool test_preempt_need_resched(void)
{
	return (raw_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED) == 0;
}
/*
 * Should we reschedule right now?
 *
 * True only when the whole __preempt_count word reads as zero, which
 * covers two conditions at once: the preempt count is 0, and the
 * (inverted) PREEMPT_NEED_RESCHED top bit is clear.
 */
static __always_inline bool should_resched(void)
{
	return unlikely(raw_cpu_read_4(__preempt_count) == 0);
}
////////// Preempt count helpers ////////
// PREEMPT_ENABLED: a count of 0 with the PREEMPT_NEED_RESCHED bit set
#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED)
// PREEMPT_DISABLE: PREEMPT_ENABLED with the count raised by 1
// NOTE(review): upstream Linux appears to spell this PREEMPT_DISABLED — confirm before relying on the name
#define PREEMPT_DISABLE (1 + PREEMPT_ENABLED)
/*
 * Return the current value of __preempt_count with the
 * PREEMPT_NEED_RESCHED flag bit masked out, so callers only see the
 * counter fields.
 */
static __always_inline int preempt_count(void)
{
	return ~PREEMPT_NEED_RESCHED & raw_cpu_read_4(__preempt_count);
}
/*
 * Add @val to __preempt_count.
 *
 * @val: amount to add to the counter word.
 */
static __always_inline void __preempt_count_add(int val)
{
	raw_cpu_add_4(__preempt_count, val);
}
/*
 * Subtract @val from __preempt_count.
 *
 * Implemented as an addition of the negated value.
 *
 * @val: amount to remove from the counter word.
 */
static __always_inline void __preempt_count_sub(int val)
{
	const int delta = -val;

	raw_cpu_add_4(__preempt_count, delta);
}
/*
 * Disable preemption: increment the preempt count, then issue
 * barrier() so the increment is ordered before the code that follows.
 */
#define preempt_disable() \
do { \
	preempt_count_inc(); \
	barrier(); \
} while (0)
/*
 * Re-enable preemption: after barrier(), decrement the preempt count
 * and, if preempt_count_dec_and_test() reports true, call
 * __preempt_schedule() to reschedule.
 */
#define preempt_enable() \
do { \
	barrier(); \
	if (unlikely(preempt_count_dec_and_test())) { \
		__preempt_schedule(); \
	} \
} while (0)
/*
 * Preempt the current task and reschedule.
 *
 * PREEMPT_ACTIVE is added to the preempt count around each call to
 * __schedule(); per the original author's note this changes how the
 * scheduler treats the call.
 */
asmlinkage __visible void __sched notrace preempt_schedule(void)
{
	/*
	 * preemptible() requires a zero preempt count and enabled
	 * interrupts; if either fails, do not schedule.
	 */
	if (likely(!preemptible()))
		return;

	for (;;) {
		__preempt_count_add(PREEMPT_ACTIVE);
		__schedule();
		__preempt_count_sub(PREEMPT_ACTIVE);
		barrier();

		/* Go around again while a reschedule is still pending. */
		if (!need_resched())
			break;
	}
}
/*
 * Report whether a reschedule is pending according to the
 * thread_info flags (via tif_need_resched()). The result is hinted
 * as unlikely to be true.
 */
static __always_inline bool need_resched(void)
{
	if (unlikely(tif_need_resched()))
		return true;

	return false;
}
////// Interrupt context helpers ////////
// Hard-interrupt field of the preempt count
#define hardirq_count() (HARDIRQ_MASK & preempt_count())
// Soft-interrupt field of the preempt count
#define softirq_count() (SOFTIRQ_MASK & preempt_count())
// All interrupt-related fields together: hardirq, softirq and NMI
#define irq_count() ((HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK) & preempt_count())
// Non-zero while the hard-interrupt field is non-zero
#define in_irq() (hardirq_count())
// Non-zero while the soft-interrupt field is non-zero
#define in_softirq() (softirq_count())
// Non-zero while any interrupt-related field is non-zero
#define in_interrupt() (irq_count())
// Non-zero while the SOFTIRQ_OFFSET bit of the soft-interrupt field is set
#define in_serving_softirq() (SOFTIRQ_OFFSET & softirq_count())
// Non-zero while the NMI bit is set
#define in_nmi() (NMI_MASK & preempt_count())
// Preemptible: the preempt count is 0 and interrupts are not disabled
#define preemptible() (!preempt_count() && !irqs_disabled())
(以下汇编代码摘自 arch/x86/kernel/entry_64.S)
系统调用入口基本流程
中断入口基本流程
// System call handling path
ENTRY(system_call)
/* ... omitted ... */
// Save the current stack pointer into the per-cpu variable old_rsp
movq %rsp,PER_CPU_VAR(old_rsp)
// Load rsp from the per-cpu kernel_stack value, i.e. switch to the kernel stack
movq PER_CPU_VAR(kernel_stack),%rsp
/* ... omitted ... */
system_call_fastpath:
#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
#else
andl $__SYSCALL_MASK,%eax
cmpl $__NR_syscall_max,%eax
#endif
ja ret_from_sys_call /* and return regs->ax */
movq %r10,%rcx
// Dispatch the system call through sys_call_table, indexed by rax
call *sys_call_table(,%rax,8) # XXX: rip relative
movq %rax,RAX-ARGOFFSET(%rsp)
ret_from_sys_call:
movl $_TIF_ALLWORK_MASK,%edi
/* edi: flagmask */
// On the way out, the thread_info flags must be checked
sysret_check:
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
andl %edi,%edx
jnz sysret_careful // a masked thread_info flag is set (e.g. need_resched): take the careful path
//// Fast path: return directly
CFI_REMEMBER_STATE
/*
* sysretq will re-enable interrupts:
*/
TRACE_IRQS_ON
movq RIP-ARGOFFSET(%rsp),%rcx
CFI_REGISTER rip,rcx
RESTORE_ARGS 1,-ARG_SKIP,0
/*CFI_REGISTER rflags,r11*/
// Restore the stack pointer previously saved in the per-cpu variable old_rsp
movq PER_CPU_VAR(old_rsp), %rsp
// Return to user space
USERGS_SYSRET64
CFI_RESTORE_STATE
//// Careful path: thread_info flags were set and must be handled before returning
/* Handle reschedules */
sysret_careful:
bt $TIF_NEED_RESCHED,%edx // test the need_resched flag
jnc sysret_signal // need_resched not set: the pending work is something else (e.g. signals)
// need_resched is set: reschedule
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
SCHEDULE_USER // reschedule (original note: calls schedule()); __preempt_count is not checked on this return-to-user path
popq_cfi %rdi
jmp sysret_check // check the flags once more
// Other pending work (e.g. signals) is handled from here
sysret_signal:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
// Unconditionally continue at int_check_syscall_exit_work
jmp int_check_syscall_exit_work
/* ... omitted ... */
GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
movl $_TIF_ALLWORK_MASK,%edi
/* edi: mask to check */
GLOBAL(int_with_check)
LOCKDEP_SYS_EXIT_IRQ
GET_THREAD_INFO(%rcx)
movl TI_flags(%rcx),%edx
andl %edi,%edx
jnz int_careful
andl $~TS_COMPAT,TI_status(%rcx)
jmp retint_swapgs
/* Either reschedule or signal or syscall exit tracking needed. */
/* First do a reschedule test. */
/* edx: work, edi: workmask */
int_careful:
bt $TIF_NEED_RESCHED,%edx
jnc int_very_careful // need_resched not set: other work is pending, take the very-careful path
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
SCHEDULE_USER // reschedule
popq_cfi %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check // go back and check again
/* handle signals and tracing -- both require a full stack frame */
int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
int_check_syscall_exit_work:
SAVE_REST
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
pushq_cfi %rdi
leaq 8(%rsp),%rdi # &ptregs -> arg1
call syscall_trace_leave
popq_cfi %rdi
andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
jmp int_restore_rest
int_signal:
testl $_TIF_DO_NOTIFY_MASK,%edx
jz 1f
movq %rsp,%rdi # &ptregs -> arg1
xorl %esi,%esi # oldset -> arg2
call do_notify_resume
1: movl $_TIF_WORK_MASK,%edi
int_restore_rest:
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check // check the thread_info flags again
CFI_ENDPROC
END(system_call)
// Basic interrupt entry flow
// Wrapper macro that calls the handler passed as \func (used with do_IRQ)
.macro interrupt func
subq $ORIG_RAX-RBP, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
SAVE_ARGS_IRQ // save registers on entry to the interrupt-handling context
call \func
/*... omitted ...*/
common_interrupt:
/*... omitted ...*/
interrupt do_IRQ // call the C function do_IRQ, which actually handles the interrupt
ret_from_intr: // return from interrupt
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count) // decrement the per-cpu irq_count
/* Restore saved previous stack */
// Switch back to the previous stack
popq %rsi
CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
leaq ARGOFFSET-RBP(%rsi), %rsp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp) // test the low two bits of the saved CS: zero means the kernel was interrupted
je retint_kernel // interrupted kernel code: return to kernel space
/* Interrupt came from user space */
/*
* Has a correct top of stack, but a partial stack frame
* %rcx: thread info. Interrupts off.
*/
// User space was interrupted: return to user space
retint_with_reschedule:
movl $_TIF_WORK_MASK,%edi
retint_check:
LOCKDEP_SYS_EXIT_IRQ
movl TI_flags(%rcx),%edx
andl %edi,%edx
CFI_REMEMBER_STATE
jnz retint_careful // work is pending (need_resched or signal notification)
retint_swapgs: /* return to user-space */
/*
* The iretq could re-enable interrupts:
*/
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ
SWAPGS
jmp restore_args
retint_restore_args: /* return to kernel space */
DISABLE_INTERRUPTS(CLBR_ANY)
/*
* The iretq could re-enable interrupts:
*/
TRACE_IRQS_IRETQ
restore_args:
RESTORE_ARGS 1,8,1
irq_return:
INTERRUPT_RETURN // leads into native_iret
ENTRY(native_iret)
/*... omitted ...*/
/* edi: workmask, edx: work */
retint_careful:
CFI_RESTORE_STATE
bt $TIF_NEED_RESCHED,%edx
jnc retint_signal // need_resched not set: go check for signal work
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
SCHEDULE_USER // reschedule before returning to user space
popq_cfi %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp retint_check // check the thread_info flags again
retint_signal:
testl $_TIF_DO_NOTIFY_MASK,%edx
jz retint_swapgs
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
movq $-1,ORIG_RAX(%rsp)
xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
RESTORE_REST
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
jmp retint_with_reschedule // signal work done: go back and re-check the work flags, including need_resched
//// Note: when the kernel is built with preemption support, returning to the kernel goes through this retint_kernel
#ifdef CONFIG_PREEMPT
/* Returning to kernel space. Check if we need preemption */
/* rcx: threadinfo. interrupts off. */
ENTRY(retint_kernel)
// Check whether __preempt_count is 0
cmpl $0,PER_CPU_VAR(__preempt_count)
jnz retint_restore_args // non-zero: preemption is not allowed, just return
bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
jnc retint_restore_args
call preempt_schedule_irq // the kernel may be preempted here
jmp exit_intr // check again
#endif
CFI_ENDPROC
END(common_interrupt)
标签:
原文地址:http://blog.csdn.net/feilengcui008/article/details/51705667