From: Tomas Hruby Date: Wed, 15 Sep 2010 14:11:21 +0000 (+0000) Subject: SMP - can boot even if some cpus fail to boot X-Git-Tag: v3.2.0~840 X-Git-Url: http://zhaoyanbai.com/repos/?a=commitdiff_plain;h=1f89845bb257bc373c6d05c5fd4bf7ea5547047d;p=minix.git SMP - can boot even if some cpus fail to boot - EBADCPU is returned if the scheduler tries to run a process on a CPU that either does not exist or isn't booted - this change was originally meant to deal with stupid cpuid instruction which provides totally useless information about hyper-threading and MPS which does not deal with ht at all. ACPI provides correct information. If ht is turned off it looks like some CPUs failed to boot. Nevertheless this patch may be handy for testing/benchmarking in the future. --- diff --git a/include/errno.h b/include/errno.h index d95ba2c6f..4985335ec 100644 --- a/include/errno.h +++ b/include/errno.h @@ -130,4 +130,6 @@ extern int errno; /* place where the error numbers go */ #define EBADEPT (_SIGN 301) /* specified endpoint is bad */ #define EDEADEPT (_SIGN 302) /* specified endpoint is not alive */ +#define EBADCPU (_SIGN 1000) /* requested CPU does not work */ + #endif /* _ERRNO_H */ diff --git a/kernel/arch/i386/arch_smp.c b/kernel/arch/i386/arch_smp.c index 4f1445742..1d4a59bf8 100644 --- a/kernel/arch/i386/arch_smp.c +++ b/kernel/arch/i386/arch_smp.c @@ -36,8 +36,8 @@ extern void * __trampoline_end; extern u32_t busclock[CONFIG_MAX_CPUS]; extern int panicking; -static int ap_cpu_ready; -static int cpu_down; +static int volatile ap_cpu_ready; +static int volatile cpu_down; /* there can be at most 255 local APIC ids, each fits in 8 bits */ PRIVATE unsigned char apicid2cpuid[255]; @@ -186,6 +186,11 @@ PUBLIC void smp_shutdown_aps(void) for (cpu = 0; cpu < ncpus; cpu++) { if (cpu == cpuid) continue; + if (!cpu_test_flag(cpu, CPU_IS_READY)) { + printf("CPU %d didn't boot\n", cpu); + continue; + } + cpu_down = -1; barrier(); apic_send_ipi(APIC_SMP_CPU_HALT_VECTOR, cpu, APIC_IPI_DEST); 
diff --git a/kernel/smp.c b/kernel/smp.c index 38d503457..2dd730491 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -27,9 +27,20 @@ SPINLOCK_DEFINE(boot_lock) PUBLIC void wait_for_APs_to_finish_booting(void) { + unsigned n = 0; + int i; + + /* check how many cpus are actually alive */ + for (i = 0 ; i < ncpus ; i++) { + if (cpu_test_flag(i, CPU_IS_READY)) + n++; + } + if (n != ncpus) + printf("WARNING only %d out of %d cpus booted\n", n, ncpus); + /* we must let the other CPUs to run in kernel mode first */ BKL_UNLOCK(); - while (ap_cpus_booted != (ncpus - 1)) + while (ap_cpus_booted != (n - 1)) arch_pause(); /* now we have to take the lock again as we continu execution */ BKL_LOCK(); diff --git a/kernel/system.c b/kernel/system.c index 0c79ae7a6..cf0fe9ba0 100644 --- a/kernel/system.c +++ b/kernel/system.c @@ -653,6 +653,8 @@ PUBLIC int sched_proc(struct proc *p, #ifdef CONFIG_SMP if ((cpu < 0 && cpu != -1) || (cpu > 0 && (unsigned) cpu >= ncpus)) return(EINVAL); + if (cpu != -1 && !(cpu_is_ready(cpu))) + return EBADCPU; #endif /* In some cases, we might be rescheduling a runnable process. 
In such diff --git a/servers/pm/glo.h b/servers/pm/glo.h index 76feac5c1..69631d990 100644 --- a/servers/pm/glo.h +++ b/servers/pm/glo.h @@ -29,5 +29,5 @@ EXTERN char monitor_code[256]; EXTERN struct machine machine; /* machine info */ #ifdef CONFIG_SMP -EXTERN unsigned cpu_proc[CONFIG_MAX_CPUS]; +EXTERN int cpu_proc[CONFIG_MAX_CPUS]; #endif diff --git a/servers/sched/schedule.c b/servers/sched/schedule.c index e73e3ac4e..39e8986e7 100644 --- a/servers/sched/schedule.c +++ b/servers/sched/schedule.c @@ -38,6 +38,10 @@ FORWARD _PROTOTYPE( void balance_queues, (struct timer *tp) ); #define schedule_process_migrate(p) \ schedule_process(p, SCHEDULE_CHANGE_CPU) +#define CPU_DEAD -1 + +#define cpu_is_available(c) (cpu_proc[c] >= 0) + #define DEFAULT_USER_TIME_SLICE 200 /* processes created by RS are sysytem processes */ @@ -62,7 +66,12 @@ PRIVATE void pick_cpu(struct schedproc * proc) return; } + /* if no other cpu available, try BSP */ + cpu = machine.bsp_id; for (c = 0; c < machine.processors_count; c++) { + /* skip dead cpus */ + if (!cpu_is_available(c)) + continue; if (c != machine.bsp_id && cpu_load > cpu_proc[c]) { cpu_load = cpu_proc[c]; cpu = c; @@ -218,7 +227,13 @@ PUBLIC int do_start_scheduling(message *m_ptr) /* Schedule the process, giving it some quantum */ pick_cpu(rmp); - if ((rv = schedule_process(rmp, SCHEDULE_CHANGE_ALL)) != OK) { + while ((rv = schedule_process(rmp, SCHEDULE_CHANGE_ALL)) == EBADCPU) { + /* don't try this CPU ever again */ + cpu_proc[rmp->cpu] = CPU_DEAD; + pick_cpu(rmp); + } + + if (rv != OK) { printf("Sched: Error while scheduling process, kernel replied %d\n", rv); return rv;