diff --git a/arch/arm/common/time-acorn.c b/arch/arm/common/time-acorn.c index deeed56..b8adf8f 100644 --- a/arch/arm/common/time-acorn.c +++ b/arch/arm/common/time-acorn.c @@ -75,7 +75,7 @@ ioc_timer_interrupt(int irq, void *dev_id) static struct irqaction ioc_timer_irq = { .name = "timer", - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .handler = ioc_timer_interrupt }; diff --git a/arch/arm/mach-at91/gpio.c b/arch/arm/mach-at91/gpio.c index 9b0447c..9a92e5b 100644 --- a/arch/arm/mach-at91/gpio.c +++ b/arch/arm/mach-at91/gpio.c @@ -368,12 +368,18 @@ static int gpio_irq_type(unsigned pin, unsigned type) } } +static void gpio_irq_ack_noop(unsigned int irq) +{ + /* Dummy function. */ +} + static struct irq_chip gpio_irqchip = { .name = "GPIO", .mask = gpio_irq_mask, .unmask = gpio_irq_unmask, .set_type = gpio_irq_type, .set_wake = gpio_irq_set_wake, + .ack = gpio_irq_ack_noop, }; static void gpio_irq_handler(unsigned irq, struct irq_desc *desc) @@ -522,7 +528,7 @@ void __init at91_gpio_irq_setup(void) * shorter, and the AIC handles interrupts sanely. */ set_irq_chip(pin, &gpio_irqchip); - set_irq_handler(pin, handle_simple_irq); + set_irq_handler(pin, handle_edge_irq); set_irq_flags(pin, IRQF_VALID); } diff --git a/arch/arm/oprofile/op_model_xscale.c b/arch/arm/oprofile/op_model_xscale.c index 724ab9c..9f5fdbb 100644 --- a/arch/arm/oprofile/op_model_xscale.c +++ b/arch/arm/oprofile/op_model_xscale.c @@ -381,8 +381,9 @@ static int xscale_pmu_start(void) { int ret; u32 pmnc = read_pmnc(); + int irq_flags = IRQF_DISABLED | IRQF_NODELAY; - ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, IRQF_DISABLED, + ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, irq_flags, "XScale PMU", (void *)results); if (ret < 0) { diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index 3f2d774..c33772f 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -74,9 +74,11 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { idle(); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } diff --git a/arch/m68knommu/platform/coldfire/pit.c b/arch/m68knommu/platform/coldfire/pit.c index c5b9167..0867b76 100644 --- a/arch/m68knommu/platform/coldfire/pit.c +++ b/arch/m68knommu/platform/coldfire/pit.c @@ -119,7 +119,7 @@ static irqreturn_t pit_tick(int irq, void *dummy) static struct irqaction pit_irq = { .name = "timer", - .flags = IRQF_DISABLED | IRQF_TIMER, + .flags = IRQF_DISABLED | IRQF_TIMER | IRQF_NODELAY, .handler = pit_tick, }; diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c index b6ac551..5977fef 100644 --- a/arch/mips/kernel/i8253.c +++ b/arch/mips/kernel/i8253.c @@ -97,7 +97,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 7ecc0d1..99e4e93 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1031,7 +1031,7 @@ global_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -1045,7 +1045,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED) bne- do_resched andi. r0,r9,_TIF_USER_WORK_MASK beq restore_user diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index e0bcf93..ba0cfa5 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -644,16 +644,23 @@ do_work: user_work: #endif - /* Enable interrupts */ - ori r10,r10,MSR_EE - mtmsrd r10,1 - andi. r0,r4,_TIF_NEED_RESCHED beq 1f - bl .schedule + + /* preempt_schedule_irq() expects interrupts disabled. */ + bl .preempt_schedule_irq b .ret_from_except_lite -1: bl .save_nvgprs + /* here we are preempting the current task */ +1: li r0,1 + stb r0,PACASOFTIRQEN(r13) + stb r0,PACAHARDIRQEN(r13) + + /* Enable interrupts */ + ori r10,r10,MSR_EE + mtmsrd r10,1 + + bl .save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl .do_signal b .ret_from_except diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 9043251..68ed633 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -515,6 +515,19 @@ static void __unmask_and_level_IO_APIC_irq(unsigned int irq) } #endif /* CONFIG_X86_32 */ +/* trigger = 0 (edge mode) */ +static void __pcix_mask_IO_APIC_irq (unsigned int irq) +{ + io_apic_modify_irq(irq, ~0, 0x00008000, NULL); +} + +/* mask = 0, trigger = 1 (level mode) */ +static void __pcix_unmask_IO_APIC_irq (unsigned int irq) +{ + io_apic_modify_irq(irq, ~0x00010000, 0x00008000, NULL); +} + + static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -533,6 +546,24 @@ static void unmask_IO_APIC_irq (unsigned int irq) spin_unlock_irqrestore(&ioapic_lock, flags); } +static void pcix_mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void pcix_unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __pcix_unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; @@ -1186,6 +1217,9 @@ static struct irq_chip ioapic_chip; #ifdef CONFIG_INTR_REMAP static struct irq_chip ir_ioapic_chip; #endif +static struct irq_chip pcix_ioapic_chip; + + #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -1215,9 +1249,10 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(int irq, unsigned long trigger) +static void ioapic_register_intr(int irq, unsigned long trigger, int pcix) { struct irq_desc *desc; + struct irq_chip *chip = pcix ? &pcix_ioapic_chip : &ioapic_chip; desc = irq_to_desc(irq); @@ -1242,12 +1277,11 @@ static void ioapic_register_intr(int irq, unsigned long trigger) #endif if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); + set_irq_chip_and_handler_name(irq, chip, handle_fasteoi_irq, + pcix ? "pcix-fasteoi" : "fasteoi"); else - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + set_irq_chip_and_handler_name(irq, chip, handle_edge_irq, + pcix ? "pcix-edge" : "edge"); } static int setup_ioapic_entry(int apic, int irq, @@ -1345,7 +1379,7 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, return; } - ioapic_register_intr(irq, trigger); + ioapic_register_intr(irq, trigger, apic > 0); if (irq < 16) disable_8259A_irq(irq); @@ -2373,13 +2407,22 @@ static void ack_apic_level(unsigned int irq) move_masked_irq(irq); unmask_IO_APIC_irq(irq); } +#if (defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE)) && \ + defined(CONFIG_PREEMPT_HARDIRQS) + /* + * With threaded interrupts, we always have IRQ_INPROGRESS + * when acking. + */ + else if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) + move_masked_irq(irq); +#endif #ifdef CONFIG_X86_32 if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); spin_unlock(&ioapic_lock); } #endif @@ -2413,6 +2456,19 @@ static struct irq_chip ir_ioapic_chip __read_mostly = { }; #endif +static struct irq_chip pcix_ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = pcix_mask_IO_APIC_irq, + .unmask = pcix_unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; + static inline void init_IO_APIC_traps(void) { int irq; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 845aa98..bcd3f97 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -52,6 +52,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) */ static struct irqaction fpu_irq = { .handler = math_error_irq, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "fpu", }; @@ -86,6 +87,7 @@ void __init init_ISA_irqs (void) */ static struct irqaction irq2 = { .handler = no_action, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "cascade", }; diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index ff02353..980f873 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -111,6 +111,7 @@ static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = static struct irqaction irq2 = { .handler = no_action, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "cascade", }; diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index cb19d65..875b7a3 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -113,7 +113,8 @@ unsigned long __init calibrate_cpu(void) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, + .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | + IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 0c9667f..75d4a06 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -651,11 +651,13 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NODELAY, }; diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 37b9ae4..0a347ba 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -84,7 +84,7 @@ void __init trap_init_hook(void) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c index a580b95..b159a76 100644 --- a/arch/x86/mach-voyager/setup.c +++ b/arch/x86/mach-voyager/setup.c @@ -20,6 +20,7 @@ void __init pre_intr_init_hook(void) */ static struct irqaction irq2 = { .handler = no_action, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "cascade", }; @@ -46,7 +47,7 @@ void __init trap_init_hook(void) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index b1ac63a..3e738fb 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -78,11 +78,13 @@ static int loopback_xmit(struct sk_buff *skb, struct net_device *dev) dev->last_rx = jiffies; - /* it's OK to use per_cpu_ptr() because BHs are off */ pcpu_lstats = dev->ml_priv; + /* BHs off are not enough when softirqs are threads */ + preempt_disable(); lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id()); lb_stats->bytes += skb->len; lb_stats->packets++; + preempt_enable(); netif_rx(skb); diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 12c07c1..9b3960e 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -116,4 +116,16 @@ extern void warn_slowpath(const char *file, const int line, # define WARN_ON_SMP(x) do { } while (0) #endif +#ifdef CONFIG_PREEMPT_RT +# define BUG_ON_RT(c) BUG_ON(c) +# define BUG_ON_NONRT(c) do { } while (0) +# define WARN_ON_RT(condition) WARN_ON(condition) +# define WARN_ON_NONRT(condition) do { } while (0) +#else +# define BUG_ON_RT(c) do { } while (0) +# define BUG_ON_NONRT(c) BUG_ON(c) +# define WARN_ON_RT(condition) do { } while (0) +# define WARN_ON_NONRT(condition) WARN_ON(condition) +#endif + #endif diff --git a/include/asm-generic/cmpxchg-local.h b/include/asm-generic/cmpxchg-local.h index b2ba2fc..9793123 100644 --- a/include/asm-generic/cmpxchg-local.h +++ b/include/asm-generic/cmpxchg-local.h @@ -20,7 +20,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, if (size == 8 && sizeof(unsigned long) != 8) wrong_size_cmpxchg(ptr); - local_irq_save(flags); + raw_local_irq_save(flags); switch (size) { case 1: prev = *(u8 *)ptr; if (prev == old) @@ -41,7 +41,7 @@ static inline unsigned long __cmpxchg_local_generic(volatile void *ptr, default: wrong_size_cmpxchg(ptr); } - local_irq_restore(flags); + raw_local_irq_restore(flags); return prev; } @@ -54,11 +54,11 @@ static inline u64 __cmpxchg64_local_generic(volatile void *ptr, u64 prev; unsigned long flags; - local_irq_save(flags); + raw_local_irq_save(flags); prev = *(u64 *)ptr; if (prev == old) *(u64 *)ptr = new; - local_irq_restore(flags); + raw_local_irq_restore(flags); return prev; } diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 777dbf6..27b1bcf 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -2,7 +2,6 @@ #define _LINUX_BH_H extern void local_bh_disable(void); -extern void __local_bh_enable(void); extern void _local_bh_enable(void); extern void local_bh_enable(void); extern void local_bh_enable_ip(unsigned long ip); diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 181006c..8613302 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -68,9 +68,9 @@ * Are we doing bottom half or hardware interrupt processing? * Are we in a softirq context? Interrupt context? */ -#define in_irq() (hardirq_count()) -#define in_softirq() (softirq_count()) -#define in_interrupt() (irq_count()) +#define in_irq() (hardirq_count() || (current->flags & PF_EXT_HARDIRQ)) +#define in_softirq() (softirq_count() || (current->flags & PF_EXT_SOFTIRQ)) +#define in_interrupt() (irq_count()) #if defined(CONFIG_PREEMPT) # define PREEMPT_INATOMIC_BASE kernel_locked() diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 3eba438..428d015 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -202,6 +202,9 @@ struct hrtimer_cpu_base { int hres_active; unsigned long nr_events; #endif +#ifdef CONFIG_PREEMPT_SOFTIRQS + wait_queue_head_t wait; +#endif }; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) @@ -384,6 +387,13 @@ static inline int hrtimer_restart(struct hrtimer *timer) return hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* Softirq preemption could deadlock timer removal */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + extern void hrtimer_wait_for_timer(const struct hrtimer *timer); +#else +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0) +#endif + /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd890..b353370 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -169,6 +169,7 @@ extern struct group_info init_groups; .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ .fs_excl = ATOMIC_INIT(0), \ + .posix_timer_list = NULL, \ .pi_lock = __SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ .pids = { \ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f58a0cf..fee8201 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -53,10 +53,12 @@ #define IRQF_SAMPLE_RANDOM 0x00000040 #define IRQF_SHARED 0x00000080 #define IRQF_PROBE_SHARED 0x00000100 -#define IRQF_TIMER 0x00000200 +#define __IRQF_TIMER 0x00000200 #define IRQF_PERCPU 0x00000400 #define IRQF_NOBALANCING 0x00000800 #define IRQF_IRQPOLL 0x00001000 +#define IRQF_NODELAY 0x00002000 +#define IRQF_TIMER (__IRQF_TIMER | IRQF_NODELAY) typedef irqreturn_t (*irq_handler_t)(int, void *); @@ -68,7 +70,7 @@ struct irqaction { void *dev_id; struct irqaction *next; int irq; - struct proc_dir_entry *dir; + struct proc_dir_entry *dir, *threaded; }; extern irqreturn_t no_action(int cpl, void *dev_id); @@ -223,6 +225,7 @@ static inline int disable_irq_wake(unsigned int irq) #ifndef __ARCH_SET_SOFTIRQ_PENDING #define set_softirq_pending(x) (local_softirq_pending() = (x)) +// FIXME: PREEMPT_RT: set_bit()? #define or_softirq_pending(x) (local_softirq_pending() |= (x)) #endif @@ -268,13 +271,29 @@ struct softirq_action void (*action)(struct softirq_action *); }; +#ifdef CONFIG_PREEMPT_HARDIRQS +# define __raise_softirq_irqoff(nr) raise_softirq_irqoff(nr) +# define __do_raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) +# define THREADED_IRQS 1 +#else +# define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) +# define __do_raise_softirq_irqoff(nr) __raise_softirq_irqoff(nr) +# define THREADED_IRQS 0 +#endif + asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); -#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); +extern void wakeup_irqd(void); + +#ifdef CONFIG_PREEMPT_SOFTIRQS +extern void wait_for_softirq(int softirq); +#else +# define wait_for_softirq(x) do {} while(0) +#endif /* This is the worklist that queues up per-cpu softirq work. * @@ -309,8 +328,9 @@ extern void __send_remote_softirq(struct call_single_data *cp, int cpu, to be executed on some cpu at least once after this. * If the tasklet is already scheduled, but its excecution is still not started, it will be executed only once. - * If this tasklet is already running on another CPU (or schedule is called - from tasklet itself), it is rescheduled for later. + * If this tasklet is already running on another CPU, it is rescheduled + for later. + * Schedule must not be called from the tasklet itself (a lockup occurs) * Tasklet is strictly serialized wrt itself, but not wrt another tasklets. If client needs some intertask synchronization, he makes it with spinlocks. @@ -335,15 +355,25 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data } enum { TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */ + TASKLET_STATE_PENDING /* Tasklet is pending */ }; -#ifdef CONFIG_SMP +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED) +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN) +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING) + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } +static inline int tasklet_tryunlock(struct tasklet_struct *t) +{ + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN; +} + static inline void tasklet_unlock(struct tasklet_struct *t) { smp_mb__before_clear_bit(); @@ -356,6 +386,7 @@ static inline void tasklet_unlock_wait(struct tasklet_struct *t) } #else #define tasklet_trylock(t) 1 +#define tasklet_tryunlock(t) 1 #define tasklet_unlock_wait(t) do { } while (0) #define tasklet_unlock(t) do { } while (0) #endif @@ -390,22 +421,14 @@ static inline void tasklet_disable(struct tasklet_struct *t) smp_mb(); } -static inline void tasklet_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} - -static inline void tasklet_hi_enable(struct tasklet_struct *t) -{ - smp_mb__before_atomic_dec(); - atomic_dec(&t->count); -} +extern void tasklet_enable(struct tasklet_struct *t); +extern void tasklet_hi_enable(struct tasklet_struct *t); extern void tasklet_kill(struct tasklet_struct *t); extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); +void takeover_tasklets(unsigned int cpu); /* * Autoprobing for irqs: diff --git a/include/linux/irq.h b/include/linux/irq.h index 3dddfa7..ff83182 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -20,14 +20,17 @@ #include #include #include +#include #include #include #include +#include struct irq_desc; typedef void (*irq_flow_handler_t)(unsigned int irq, struct irq_desc *desc); +typedef void (*irq_thread_handler_t)(struct irq_desc *desc); /* @@ -65,6 +68,7 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_SPURIOUS_DISABLED 0x00800000 /* IRQ was disabled by the spurious trap */ #define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ #define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ +#define IRQ_NODELAY 0x40000000 /* IRQ must run immediately */ #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) @@ -133,6 +137,7 @@ struct irq_chip { * struct irq_desc - interrupt descriptor * @irq: interrupt number for this descriptor * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] + * @handle_irq_thread: highlevel threaded irq-events handler * @chip: low level interrupt hardware access * @msi_desc: MSI descriptor * @handler_data: per-IRQ data for the irq_chip methods @@ -145,6 +150,9 @@ struct irq_chip { * @irq_count: stats field to detect stalled irqs * @irqs_unhandled: stats field for spurious unhandled interrupts * @last_unhandled: aging timer for unhandled count + * @thread: Thread pointer for threaded preemptible irq handling + * @wait_for_handler: Waitqueue to wait for a running preemptible handler + * @cycles: Timestamp for stats and debugging * @lock: locking for SMP * @affinity: IRQ affinity on SMP * @cpu: cpu index useful for balancing @@ -155,6 +163,9 @@ struct irq_chip { struct irq_desc { unsigned int irq; irq_flow_handler_t handle_irq; +#ifdef CONFIG_PREEMPT_HARDIRQS + irq_thread_handler_t handle_irq_thread; +#endif struct irq_chip *chip; struct msi_desc *msi_desc; void *handler_data; @@ -167,6 +178,9 @@ struct irq_desc { unsigned int irq_count; /* For detecting broken IRQs */ unsigned int irqs_unhandled; unsigned long last_unhandled; /* Aging timer for unhandled count */ + struct task_struct *thread; + wait_queue_head_t wait_for_handler; + cycles_t timestamp; spinlock_t lock; #ifdef CONFIG_SMP cpumask_t affinity; @@ -380,7 +394,26 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); #define get_irq_data(irq) (irq_to_desc(irq)->handler_data) #define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc) -#endif /* CONFIG_GENERIC_HARDIRQS */ +/* Early initialization of irqs */ +extern void early_init_hardirqs(void); +extern cycles_t irq_timestamp(unsigned int irq); + +#ifdef CONFIG_PREEMPT_HARDIRQS +extern void init_hardirqs(void); +extern void +__set_irq_thread_handler(struct irq_desc *desc, irq_flow_handler_t handle); +#else +static inline void init_hardirqs(void) { } +static inline void +__set_irq_thread_handler(struct irq_desc *desc, irq_flow_handler_t handle) { } +#endif + +#else /* end GENERIC HARDIRQS */ + +static inline void early_init_hardirqs(void) { } +static inline void init_hardirqs(void) { } + +#endif /* !CONFIG_GENERIC_HARDIRQS */ #endif /* !CONFIG_S390 */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e26f549..004e6ef 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -448,7 +448,11 @@ struct netdev_queue { struct Qdisc *qdisc; unsigned long state; spinlock_t _xmit_lock; +#ifdef CONFIG_PREEMPT_SOFTIRQS + struct task_struct *xmit_lock_owner; +#else int xmit_lock_owner; +#endif struct Qdisc *qdisc_sleeping; } ____cacheline_aligned_in_smp; @@ -1509,35 +1513,84 @@ static inline void netif_rx_complete(struct net_device *dev, local_irq_restore(flags); } -static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) +#ifdef CONFIG_PREEMPT_SOFTIRQS +/* + * Use macros to hide the smp_processor_id() call. + */ +static inline void +__assign_net_xmit_lock_owner(struct netdev_queue *txq) +{ + txq->xmit_lock_owner = current; +} +#define assign_net_xmit_lock_owner(txq, cpu) \ + __assign_net_xmit_lock_owner(txq) + +static inline void +clear_net_xmit_lock_owner(struct netdev_queue *txq) +{ + txq->xmit_lock_owner = NULL; +} +static inline int +__test_net_xmit_lock_owner(struct netdev_queue *txq) +{ + return txq->xmit_lock_owner == current; +} +#define test_net_xmit_lock_owner(txq, cpu) \ + __test_net_xmit_lock_owner(txq) + +static inline void ___netif_tx_lock(struct netdev_queue *txq) { spin_lock(&txq->_xmit_lock); + __assign_net_xmit_lock_owner(txq); +} +#define __netif_tx_lock(txq, cpu) \ + ___netif_tx_lock(txq) +#else +static inline void +assign_net_xmit_lock_owner(struct netdev_queue *txq, int cpu) +{ txq->xmit_lock_owner = cpu; } +static inline void +clear_net_xmit_lock_owner(struct netdev_queue *txq) +{ + txq->xmit_lock_owner = -1; +} +static inline int +test_net_xmit_lock_owner(struct netdev_queue *txq, int cpu) +{ + return txq->xmit_lock_owner == cpu; +} +static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) +{ + spin_lock(&txq->_xmit_lock); + assign_net_xmit_lock_owner(txq, cpu); +} +#endif /* CONFIG_PREEMPT_SOFTIRQS */ static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - txq->xmit_lock_owner = smp_processor_id(); + assign_net_xmit_lock_owner(txq, smp_processor_id()); } static inline int __netif_tx_trylock(struct netdev_queue *txq) { int ok = spin_trylock(&txq->_xmit_lock); if (likely(ok)) - txq->xmit_lock_owner = smp_processor_id(); + assign_net_xmit_lock_owner(txq, smp_processor_id()); return ok; } static inline void __netif_tx_unlock(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + clear_net_xmit_lock_owner(txq); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + clear_net_xmit_lock_owner(txq); spin_unlock_bh(&txq->_xmit_lock); } diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index a7c7213..a3920d0 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -115,4 +115,6 @@ long clock_nanosleep_restart(struct restart_block *restart_block); void update_rlimit_cpu(unsigned long rlim_new); +int posix_cpu_thread_init(void); + #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 55e30d1..2e72e36 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -91,6 +91,18 @@ struct sched_param { #include +#ifdef CONFIG_PREEMPT_SOFTIRQS +extern int softirq_preemption; +#else +# define softirq_preemption 0 +#endif + +#ifdef CONFIG_PREEMPT_HARDIRQS +extern int hardirq_preemption; +#else +# define hardirq_preemption 0 +#endif + struct mem_cgroup; struct exec_domain; struct futex_pi_state; @@ -291,6 +303,12 @@ extern void scheduler_tick(void); extern void sched_show_task(struct task_struct *p); +#ifdef CONFIG_GENERIC_HARDIRQS +extern int debug_direct_keyboard; +#else +# define debug_direct_keyboard 0 +#endif + #ifdef CONFIG_DETECT_SOFTLOCKUP extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); @@ -1077,6 +1095,7 @@ struct task_struct { void *stack; atomic_t usage; unsigned int flags; /* per process flags, defined below */ + unsigned int flags_extend; /* more bits for above */ unsigned int ptrace; int lock_depth; /* BKL lock depth */ @@ -1185,6 +1204,8 @@ struct task_struct { struct task_cputime cputime_expires; struct list_head cpu_timers[3]; + struct task_struct *posix_timer_list; + /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; @@ -1567,6 +1588,10 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ #define PF_FREEZER_NOSIG 0x80000000 /* Freezer won't send signals to it */ +/* extended flags */ +#define PF_EXT_SOFTIRQ 0x00000001 /* softirq context */ +#define PF_EXT_HARDIRQ 0x00000002 /* hardirq context */ + /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example @@ -2123,6 +2148,8 @@ static inline int cond_resched_bkl(void) { return _cond_resched(); } +extern int cond_resched_softirq_context(void); +extern int cond_resched_hardirq_context(void); /* * Does a critical section need to be broken due to another @@ -2162,6 +2189,20 @@ static inline void thread_group_cputime_free(struct signal_struct *sig) free_percpu(sig->cputime.totals); } +static inline int softirq_need_resched(void) +{ + if (softirq_preemption && (current->flags & PF_EXT_SOFTIRQ)) + return need_resched(); + return 0; +} + +static inline int hardirq_need_resched(void) +{ + if (hardirq_preemption && (current->flags & PF_EXT_HARDIRQ)) + return need_resched(); + return 0; +} + /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. diff --git a/include/linux/timer.h b/include/linux/timer.h index daf9685..a00f195 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -166,10 +166,12 @@ static inline void add_timer(struct timer_list *timer) __mod_timer(timer, timer->expires); } -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) + extern int timer_pending_sync(struct timer_list *timer); extern int try_to_del_timer_sync(struct timer_list *timer); extern int del_timer_sync(struct timer_list *timer); #else +# define timer_pending_sync(t) timer_pending(t) # define try_to_del_timer_sync(t) del_timer(t) # define del_timer_sync(t) del_timer(t) #endif diff --git a/init/main.c b/init/main.c index 7e117a2..f4e9cf4 100644 --- a/init/main.c +++ b/init/main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -588,8 +590,10 @@ asmlinkage void __init start_kernel(void) * fragile until we cpu_idle() for the first time. */ preempt_disable(); + build_all_zonelists(); page_alloc_init(); + early_init_hardirqs(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, @@ -854,6 +858,8 @@ static int __init kernel_init(void * unused) smp_prepare_cpus(setup_max_cpus); + init_hardirqs(); + do_pre_smp_initcalls(); start_boot_trace(); @@ -884,5 +890,7 @@ static int __init kernel_init(void * unused) */ stop_boot_trace(); init_post(); + WARN_ON(debug_direct_keyboard); + return 0; } diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 9fdba03..ba94f55 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -77,3 +77,39 @@ config RCU_TRACE Say Y here if you want to enable RCU tracing Say N if you are unsure. + +config PREEMPT_SOFTIRQS + bool "Thread Softirqs" + default n +# depends on PREEMPT + help + This option reduces the latency of the kernel by 'threading' + soft interrupts. This means that all softirqs will execute + in softirqd's context. While this helps latency, it can also + reduce performance. + + The threading of softirqs can also be controlled via + /proc/sys/kernel/softirq_preemption runtime flag and the + sofirq-preempt=0/1 boot-time option. + + Say N if you are unsure. + +config PREEMPT_HARDIRQS + bool "Thread Hardirqs" + default n + depends on !GENERIC_HARDIRQS_NO__DO_IRQ + select PREEMPT_SOFTIRQS + help + This option reduces the latency of the kernel by 'threading' + hardirqs. This means that all (or selected) hardirqs will run + in their own kernel thread context. While this helps latency, + this feature can also reduce performance. + + The threading of hardirqs can also be controlled via the + /proc/sys/kernel/hardirq_preemption runtime flag and the + hardirq-preempt=0/1 boot-time option. Per-irq threading can + be enabled/disable via the /proc/irq///threaded + runtime flags. + + Say N if you are unsure. + diff --git a/kernel/fork.c b/kernel/fork.c index 495da2e..e5b554e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include @@ -928,6 +929,7 @@ static void posix_cpu_timers_init(struct task_struct *tsk) INIT_LIST_HEAD(&tsk->cpu_timers[0]); INIT_LIST_HEAD(&tsk->cpu_timers[1]); INIT_LIST_HEAD(&tsk->cpu_timers[2]); + tsk->posix_timer_list = NULL; } /* diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 47e6334..d060bf1 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1073,7 +1073,7 @@ int hrtimer_cancel(struct hrtimer *timer) if (ret >= 0) return ret; - cpu_relax(); + hrtimer_wait_for_timer(timer); } } EXPORT_SYMBOL_GPL(hrtimer_cancel); @@ -1192,6 +1192,32 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) } EXPORT_SYMBOL_GPL(hrtimer_get_res); +#ifdef CONFIG_PREEMPT_SOFTIRQS +# define wake_up_timer_waiters(b) wake_up(&(b)->wait) + +/** + * hrtimer_wait_for_timer - Wait for a running timer + * + * @timer: timer to wait for + * + * The function waits in case the timers callback function is + * currently executed on the waitqueue of the timer base. The + * waitqueue is woken up after the timer callback function has + * finished execution. + */ +void hrtimer_wait_for_timer(const struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer->base; + + if (base && base->cpu_base) + wait_event(base->cpu_base->wait, + !(timer->state & HRTIMER_STATE_CALLBACK)); +} + +#else +# define wake_up_timer_waiters(b) do { } while (0) +#endif + static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) { spin_lock_irq(&cpu_base->lock); @@ -1259,6 +1285,8 @@ static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) } } spin_unlock_irq(&cpu_base->lock); + + wake_up_timer_waiters(cpu_base); } static void __run_hrtimer(struct hrtimer *timer) @@ -1657,6 +1685,9 @@ static void __cpuinit init_hrtimers_cpu(int cpu) INIT_LIST_HEAD(&cpu_base->cb_pending); hrtimer_init_hres(cpu_base); +#ifdef CONFIG_PREEMPT_SOFTIRQS + init_waitqueue_head(&cpu_base->wait); +#endif } #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index cc0f732..5e5be5a 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 10b5092..e8aa31c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -287,8 +287,10 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) if (desc->chip->mask_ack) desc->chip->mask_ack(irq); else { - desc->chip->mask(irq); - desc->chip->ack(irq); + if (desc->chip->mask) + desc->chip->mask(irq); + if (desc->chip->ack) + desc->chip->ack(irq); } } @@ -312,8 +314,10 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) + if (unlikely(desc->status & IRQ_INPROGRESS)) { + desc->status |= IRQ_PENDING; goto out_unlock; + } desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -322,6 +326,11 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->status |= IRQ_INPROGRESS; + /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_unlock; spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); @@ -367,6 +376,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; desc->status |= IRQ_INPROGRESS; + + /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_unlock; + spin_unlock(&desc->lock); action_ret = handle_IRQ_event(irq, action); @@ -418,6 +434,15 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) } desc->status |= IRQ_INPROGRESS; + /* + * In the threaded case we fall back to a mask+eoi sequence: + */ + if (redirect_hardirq(desc)) { + if (desc->chip->mask) + desc->chip->mask(irq); + goto out; + } + desc->status &= ~IRQ_PENDING; spin_unlock(&desc->lock); @@ -429,7 +454,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); - spin_unlock(&desc->lock); } @@ -475,6 +499,12 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; + /* + * hardirq redirection to the irqd process context: + */ + if (redirect_hardirq(desc)) + goto out_unlock; + do { struct irqaction *action = desc->action; irqreturn_t action_ret; @@ -575,6 +605,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->handle_irq = handle; desc->name = name; + __set_irq_thread_handler(desc, handle); + if (handle != handle_bad_irq && is_chained) { desc->status &= ~IRQ_DISABLED; desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c815b42..8cf3aa4 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -135,24 +136,85 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - if (!(action->flags & IRQF_DISABLED)) - local_irq_enable_in_hardirq(); +#ifdef __i386__ + if (debug_direct_keyboard && irq == 1) + lockdep_off(); +#endif + + /* + * Unconditionally enable interrupts for threaded + * IRQ handlers: + */ + if (!hardirq_count() || !(action->flags & IRQF_DISABLED)) + local_irq_enable(); do { + unsigned int preempt_count = preempt_count(); + ret = action->handler(irq, action->dev_id); + if (preempt_count() != preempt_count) { + print_symbol("BUG: unbalanced irq-handler preempt count in %s!\n", (unsigned long) action->handler); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + dump_stack(); + preempt_count() = preempt_count; + } if (ret == IRQ_HANDLED) status |= action->flags; retval |= ret; action = action->next; } while (action); - if (status & IRQF_SAMPLE_RANDOM) + if (status & IRQF_SAMPLE_RANDOM) { + local_irq_enable(); add_interrupt_randomness(irq); + } local_irq_disable(); +#ifdef __i386__ + if (debug_direct_keyboard && irq == 1) + lockdep_on(); +#endif return retval; } +/* + * Hack - used for development only. + */ +int __read_mostly debug_direct_keyboard = 0; + +int __init debug_direct_keyboard_setup(char *str) +{ + debug_direct_keyboard = 1; + printk(KERN_INFO "Switching IRQ 1 (keyboard) to to direct!\n"); +#ifdef CONFIG_PREEMPT_RT + printk(KERN_INFO "WARNING: kernel may easily crash this way!\n"); +#endif + return 1; +} + +__setup("debug_direct_keyboard", debug_direct_keyboard_setup); + +int redirect_hardirq(struct irq_desc *desc) +{ + /* + * Direct execution: + */ + if (!hardirq_preemption || (desc->status & IRQ_NODELAY) || + !desc->thread) + return 0; + +#ifdef __i386__ + if (debug_direct_keyboard && (desc - irq_desc == 1)) + return 0; +#endif + + BUG_ON(!irqs_disabled()); + if (desc->thread && desc->thread->state != TASK_RUNNING) + wake_up_process(desc->thread); + + return 1; +} + #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ /** * __do_IRQ - original all in one highlevel IRQ handler diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 64c1c72..600751c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -13,6 +13,10 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); +extern int redirect_hardirq(struct irq_desc *desc); + +void recalculate_desc_flags(struct irq_desc *desc); + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 801addd..776c715 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -8,8 +8,10 @@ */ #include -#include #include +#include +#include +#include #include #include @@ -44,8 +46,12 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->status & IRQ_INPROGRESS) - cpu_relax(); + if (hardirq_preemption && !(desc->status & IRQ_NODELAY)) + wait_event(desc->wait_for_handler, + !(desc->status & IRQ_INPROGRESS)); + else + while (desc->status & IRQ_INPROGRESS) + cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ spin_lock_irqsave(&desc->lock, flags); @@ -320,6 +326,21 @@ int set_irq_wake(unsigned int irq, unsigned int on) EXPORT_SYMBOL(set_irq_wake); /* + * If any action has IRQF_NODELAY then turn IRQ_NODELAY on: + */ +void recalculate_desc_flags(struct irq_desc *desc) +{ + struct irqaction *action; + + desc->status &= ~IRQ_NODELAY; + for (action = desc->action ; action; action = action->next) + if (action->flags & IRQF_NODELAY) + desc->status |= IRQ_NODELAY; +} + +static int start_irq_thread(int irq, struct irq_desc *desc); + +/* * Internal function that tells the architecture code whether a * particular irq has been exclusively allocated or is available * for driver use. @@ -420,6 +441,9 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) rand_initialize_irq(irq); } + if (!(new->flags & IRQF_NODELAY)) + if (start_irq_thread(irq, desc)) + return -ENOMEM; /* * The following block of code has to be executed atomically */ @@ -501,6 +525,11 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) *p = new; + /* + * Propagate any possible IRQF_NODELAY flag into IRQ_NODELAY: + */ + recalculate_desc_flags(desc); + /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; @@ -518,7 +547,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) new->irq = irq; register_irq_proc(irq, desc); - new->dir = NULL; + new->dir = new->threaded = NULL; register_handler_proc(irq, new); return 0; @@ -603,6 +632,7 @@ void free_irq(unsigned int irq, void *dev_id) else desc->chip->disable(irq); } + recalculate_desc_flags(desc); spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -734,3 +764,267 @@ int request_irq(unsigned int irq, irq_handler_t handler, return retval; } EXPORT_SYMBOL(request_irq); + +#ifdef CONFIG_PREEMPT_HARDIRQS + +int hardirq_preemption = 1; + +EXPORT_SYMBOL(hardirq_preemption); + +static int __init hardirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + hardirq_preemption = 0; + else + get_option(&str, &hardirq_preemption); + if (!hardirq_preemption) + printk("turning off hardirq preemption!\n"); + + return 1; +} + +__setup("hardirq-preempt=", hardirq_preempt_setup); + + +/* + * threaded simple handler + */ +static void thread_simple_irq(irq_desc_t *desc) +{ + struct irqaction *action = desc->action; + unsigned int irq = desc - irq_desc; + irqreturn_t action_ret; + + do { + if (!action || desc->depth) + break; + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, action); + cond_resched_hardirq_context(); + spin_lock(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + } while (desc->status & IRQ_PENDING); + desc->status &= ~IRQ_INPROGRESS; +} + +/* + * threaded level type irq handler + */ +static void thread_level_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + thread_simple_irq(desc); + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); +} + +/* + * threaded fasteoi type irq handler + */ +static void thread_fasteoi_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + thread_simple_irq(desc); + if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) + desc->chip->unmask(irq); +} + +/* + * threaded edge type IRQ handler + */ +static void thread_edge_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + do { + struct irqaction *action = desc->action; + irqreturn_t action_ret; + + if (unlikely(!action)) { + desc->status &= ~IRQ_INPROGRESS; + desc->chip->mask(irq); + return; + } + + /* + * When another irq arrived while we were handling + * one, we could have masked the irq. + * Renable it, if it was not disabled in meantime. + */ + if (unlikely(((desc->status & (IRQ_PENDING | IRQ_MASKED)) == + (IRQ_PENDING | IRQ_MASKED)) && !desc->depth)) + desc->chip->unmask(irq); + + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, action); + cond_resched_hardirq_context(); + spin_lock(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + } while ((desc->status & IRQ_PENDING) && !desc->depth); + + desc->status &= ~IRQ_INPROGRESS; +} + +/* + * threaded edge type IRQ handler + */ +static void thread_do_irq(irq_desc_t *desc) +{ + unsigned int irq = desc - irq_desc; + + do { + struct irqaction *action = desc->action; + irqreturn_t action_ret; + + if (unlikely(!action)) { + desc->status &= ~IRQ_INPROGRESS; + desc->chip->disable(irq); + return; + } + + desc->status &= ~IRQ_PENDING; + spin_unlock(&desc->lock); + action_ret = handle_IRQ_event(irq, action); + cond_resched_hardirq_context(); + spin_lock(&desc->lock); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + } while ((desc->status & IRQ_PENDING) && !desc->depth); + + desc->status &= ~IRQ_INPROGRESS; + desc->chip->end(irq); +} + +void +__set_irq_thread_handler(struct irq_desc *desc, irq_flow_handler_t handle) +{ + if (handle == handle_simple_irq) + desc->handle_irq_thread = thread_simple_irq; + else if (handle == handle_level_irq) + desc->handle_irq_thread = thread_level_irq; + else if (handle == handle_fasteoi_irq) + desc->handle_irq_thread = thread_fasteoi_irq; + else if (handle == handle_edge_irq) + desc->handle_irq_thread = thread_edge_irq; + else + desc->handle_irq_thread = thread_do_irq; +} + +static void do_hardirq(struct irq_desc *desc) +{ + spin_lock(&desc->lock); + + if (!(desc->status & IRQ_INPROGRESS)) + goto out; + + if (unlikely(!desc->handle_irq_thread)) + thread_do_irq(desc); + else + desc->handle_irq_thread(desc); + out: + spin_unlock(&desc->lock); + + if (waitqueue_active(&desc->wait_for_handler)) + wake_up(&desc->wait_for_handler); +} + +extern asmlinkage void __do_softirq(void); + +static int do_irqd(void * __desc) +{ + struct sched_param param = { 0, }; + struct irq_desc *desc = __desc; + +#ifdef CONFIG_SMP + set_cpus_allowed(current, desc->affinity); +#endif + current->flags |= PF_NOFREEZE | PF_EXT_HARDIRQ; + + /* + * Set irq thread priority to SCHED_FIFO/50: + */ + param.sched_priority = MAX_USER_RT_PRIO/2; + + sys_sched_setscheduler(current->pid, SCHED_FIFO, ¶m); + + while (!kthread_should_stop()) { + local_irq_disable(); + set_current_state(TASK_INTERRUPTIBLE); + irq_enter(); + do_hardirq(desc); + irq_exit(); + local_irq_enable(); + cond_resched(); +#ifdef CONFIG_SMP + /* + * Did IRQ affinities change? + */ + if (!cpus_equal(current->cpus_allowed, desc->affinity)) + set_cpus_allowed(current, desc->affinity); +#endif + schedule(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +static int ok_to_create_irq_threads; + +static int start_irq_thread(int irq, struct irq_desc *desc) +{ + if (desc->thread || !ok_to_create_irq_threads) + return 0; + + desc->thread = kthread_create(do_irqd, desc, "IRQ-%d", irq); + if (!desc->thread) { + printk(KERN_ERR "irqd: could not create IRQ thread %d!\n", irq); + return -ENOMEM; + } + + /* + * An interrupt may have come in before the thread pointer was + * stored in desc->thread; make sure the thread gets woken up in + * such a case: + */ + smp_mb(); + wake_up_process(desc->thread); + + return 0; +} + +void __init init_hardirqs(void) +{ + int i; + ok_to_create_irq_threads = 1; + + for (i = 0; i < NR_IRQS; i++) { + irq_desc_t *desc = irq_desc + i; + + if (desc->action && !(desc->status & IRQ_NODELAY)) + start_irq_thread(i, desc); + } +} + +#else + +static int start_irq_thread(int irq, struct irq_desc *desc) +{ + return 0; +} + +#endif + +void __init early_init_hardirqs(void) +{ + int i; + + for (i = 0; i < NR_IRQS; i++) + init_waitqueue_head(&irq_desc[i].wait_for_handler); +} diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d257e7d..143a25e 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -7,6 +7,8 @@ */ #include +#include +#include #include #include #include @@ -141,45 +143,6 @@ static int irq_spurious_read(char *page, char **start, off_t off, jiffies_to_msecs(desc->last_unhandled)); } -#define MAX_NAMELEN 128 - -static int name_unique(unsigned int irq, struct irqaction *new_action) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { - if ((action != new_action) && action->name && - !strcmp(new_action->name, action->name)) { - ret = 0; - break; - } - } - spin_unlock_irqrestore(&desc->lock, flags); - return ret; -} - -void register_handler_proc(unsigned int irq, struct irqaction *action) -{ - char name [MAX_NAMELEN]; - struct irq_desc *desc = irq_to_desc(irq); - - if (!desc->dir || action->dir || !action->name || - !name_unique(irq, action)) - return; - - memset(name, 0, MAX_NAMELEN); - snprintf(name, MAX_NAMELEN, "%s", action->name); - - /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, desc->dir); -} - -#undef MAX_NAMELEN - #define MAX_NAMELEN 10 void register_irq_proc(unsigned int irq, struct irq_desc *desc) @@ -213,6 +176,8 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { + if (action->threaded) + remove_proc_entry(action->threaded->name, action->dir); if (action->dir) { struct irq_desc *desc = irq_to_desc(irq); @@ -220,6 +185,90 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) } } +#ifndef CONFIG_PREEMPT_RT + +static int threaded_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + return sprintf(page, "%c\n", + ((struct irqaction *)data)->flags & IRQF_NODELAY ? '0' : '1'); +} + +static int threaded_write_proc(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + int c; + struct irqaction *action = data; + irq_desc_t *desc = irq_desc + action->irq; + + if (get_user(c, buffer)) + return -EFAULT; + if (c != '0' && c != '1') + return -EINVAL; + + spin_lock_irq(&desc->lock); + + if (c == '0') + action->flags |= IRQF_NODELAY; + if (c == '1') + action->flags &= ~IRQF_NODELAY; + recalculate_desc_flags(desc); + + spin_unlock_irq(&desc->lock); + + return 1; +} + +#endif + +#define MAX_NAMELEN 128 + +static int name_unique(unsigned int irq, struct irqaction *new_action) +{ + struct irq_desc *desc = irq_desc + irq; + struct irqaction *action; + + for (action = desc->action ; action; action = action->next) + if ((action != new_action) && action->name && + !strcmp(new_action->name, action->name)) + return 0; + return 1; +} + +void register_handler_proc(unsigned int irq, struct irqaction *action) +{ + char name [MAX_NAMELEN]; + + if (!irq_desc[irq].dir || action->dir || !action->name || + !name_unique(irq, action)) + return; + + memset(name, 0, MAX_NAMELEN); + snprintf(name, MAX_NAMELEN, "%s", action->name); + + /* create /proc/irq/1234/handler/ */ + action->dir = proc_mkdir(name, irq_desc[irq].dir); + + if (!action->dir) + return; +#ifndef CONFIG_PREEMPT_RT + { + struct proc_dir_entry *entry; + /* create /proc/irq/1234/handler/threaded */ + entry = create_proc_entry("threaded", 0600, action->dir); + if (!entry) + return; + entry->nlink = 1; + entry->data = (void *)action; + entry->read_proc = threaded_read_proc; + entry->write_proc = threaded_write_proc; + action->threaded = entry; + } +#endif +} + +#undef MAX_NAMELEN + static void register_default_affinity_proc(void) { #ifdef CONFIG_SMP @@ -240,6 +289,9 @@ void init_irq_proc(void) register_default_affinity_proc(); + /* create /proc/irq/prof_cpu_mask */ + create_prof_cpu_mask(root_irq_dir); + /* * Create entries for all existing IRQs. */ diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dd364c1..f223169 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -14,6 +14,11 @@ #include #include +#ifdef CONFIG_X86_IO_APIC +# include +# include +#endif + static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) @@ -246,6 +251,12 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * The interrupt is stuck */ __report_bad_irq(irq, desc, action_ret); +#ifdef CONFIG_X86_IO_APIC + if (!sis_apic_bug) { + sis_apic_bug = 1; + printk(KERN_ERR "turning off IO-APIC fast mode.\n"); + } +#else /* * Now kill the IRQ */ @@ -256,6 +267,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); +#endif } desc->irqs_unhandled = 0; } diff --git a/kernel/itimer.c b/kernel/itimer.c index db7c358..d51ff79 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -161,6 +161,7 @@ again: /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { spin_unlock_irq(&tsk->sighand->siglock); + hrtimer_wait_for_timer(&tsk->signal->real_timer); goto again; } expires = timeval_to_ktime(value->it_value); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 4e5288a..4f966ee 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -562,7 +562,7 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) p->cpu_timers : p->signal->cpu_timers); head += CPUCLOCK_WHICH(timer->it_clock); - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); spin_lock(&p->sighand->siglock); listpos = head; @@ -723,7 +723,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, /* * Disarm any old timer after extracting its expiry time. */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); ret = 0; spin_lock(&p->sighand->siglock); @@ -1341,12 +1341,11 @@ static inline int fastpath_timer_check(struct task_struct *tsk) * already updated our counts. We need to check if any timers fire now. * Interrupts are disabled. */ -void run_posix_cpu_timers(struct task_struct *tsk) +void __run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; - BUG_ON(!irqs_disabled()); /* * The fast path checks that there are no expired thread or thread @@ -1398,6 +1397,168 @@ void run_posix_cpu_timers(struct task_struct *tsk) } } +#include +#include +DEFINE_PER_CPU(struct task_struct *, posix_timer_task); +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); + +static int posix_cpu_timers_thread(void *data) +{ + int cpu = (long)data; + + BUG_ON(per_cpu(posix_timer_task, cpu) != current); + + + while (!kthread_should_stop()) { + struct task_struct *tsk = NULL; + struct task_struct *next = NULL; + + if (cpu_is_offline(cpu)) + goto wait_to_die; + + /* grab task list */ + raw_local_irq_disable(); + tsk = per_cpu(posix_timer_tasklist, cpu); + per_cpu(posix_timer_tasklist, cpu) = NULL; + raw_local_irq_enable(); + + + /* its possible the list is empty, just return */ + if (!tsk) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + continue; + } + + /* Process task list */ + while (1) { + /* save next */ + next = tsk->posix_timer_list; + + /* run the task timers, clear its ptr and + * unreference it + */ + __run_posix_cpu_timers(tsk); + tsk->posix_timer_list = NULL; + put_task_struct(tsk); + + /* check if this is the last on the list */ + if (next == tsk) + break; + tsk = next; + } + } + return 0; + +wait_to_die: + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +void run_posix_cpu_timers(struct task_struct *tsk) +{ + unsigned long cpu = smp_processor_id(); + struct task_struct *tasklist; + + BUG_ON(!irqs_disabled()); + if (!per_cpu(posix_timer_task, cpu)) + return; + /* get per-cpu references */ + tasklist = per_cpu(posix_timer_tasklist, cpu); + + /* check to see if we're already queued */ + if (!tsk->posix_timer_list) { + get_task_struct(tsk); + if (tasklist) { + tsk->posix_timer_list = tasklist; + } else { + /* + * The list is terminated by a self-pointing + * task_struct + */ + tsk->posix_timer_list = tsk; + } + per_cpu(posix_timer_tasklist, cpu) = tsk; + } + /* XXX signal the thread somehow */ + wake_up_process(per_cpu(posix_timer_task, cpu)); +} + + + + +/* + * posix_cpu_thread_call - callback that gets triggered when a CPU is added. + * Here we can start up the necessary migration thread for the new CPU. + */ +static int posix_cpu_thread_call(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + struct task_struct *p; + struct sched_param param; + + switch (action) { + case CPU_UP_PREPARE: + p = kthread_create(posix_cpu_timers_thread, hcpu, + "posixcputmr/%d", cpu); + if (IS_ERR(p)) + return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; + kthread_bind(p, cpu); + /* Must be high prio to avoid getting starved */ + param.sched_priority = MAX_RT_PRIO-1; + sched_setscheduler(p, SCHED_FIFO, ¶m); + per_cpu(posix_timer_task, cpu) = p; + break; + case CPU_ONLINE: + /* Strictly unneccessary, as first user will wake it. */ + wake_up_process(per_cpu(posix_timer_task, cpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind it from offline cpu so it can run. Fall thru. */ + kthread_bind(per_cpu(posix_timer_task, cpu), + any_online_cpu(cpu_online_map)); + kthread_stop(per_cpu(posix_timer_task, cpu)); + per_cpu(posix_timer_task, cpu) = NULL; + break; + case CPU_DEAD: + kthread_stop(per_cpu(posix_timer_task, cpu)); + per_cpu(posix_timer_task, cpu) = NULL; + break; +#endif + } + return NOTIFY_OK; +} + +/* Register at highest priority so that task migration (migrate_all_tasks) + * happens before everything else. + */ +static struct notifier_block __devinitdata posix_cpu_thread_notifier = { + .notifier_call = posix_cpu_thread_call, + .priority = 10 +}; + +int __init posix_cpu_thread_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + /* Start one for boot CPU. */ + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_UP_PREPARE, cpu); + posix_cpu_thread_call(&posix_cpu_thread_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&posix_cpu_thread_notifier); + return 0; +} + + + /* * Set one of the process-wide special case CPU timers. * The tsk->sighand->siglock must be held by the caller. @@ -1663,6 +1824,12 @@ static __init int init_posix_cpu_timers(void) .nsleep = thread_cpu_nsleep, .nsleep_restart = thread_cpu_nsleep_restart, }; + unsigned long cpu; + + /* init the per-cpu posix_timer_tasklets */ + for_each_cpu_mask(cpu, cpu_possible_map) { + per_cpu(posix_timer_tasklist, cpu) = NULL; + } register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a140e44..eb55aff 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -788,6 +788,7 @@ retry: unlock_timer(timr, flag); if (error == TIMER_RETRY) { + hrtimer_wait_for_timer(&timr->it.real.timer); rtn = NULL; // We already got the old time... goto retry; } @@ -827,6 +828,7 @@ retry_delete: if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + hrtimer_wait_for_timer(&timer->it.real.timer); goto retry_delete; } @@ -857,6 +859,7 @@ retry_delete: if (timer_delete_hook(timer) == TIMER_RETRY) { unlock_timer(timer, flags); + hrtimer_wait_for_timer(&timer->it.real.timer); goto retry_delete; } list_del(&timer->list); diff --git a/kernel/sched.c b/kernel/sched.c index e4bb1dd..0aef87b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4165,9 +4165,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset, /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (hardirq_count() - hardirq_offset) + if (hardirq_count() - hardirq_offset || (p->flags & PF_EXT_HARDIRQ)) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (softirq_count() || (p->flags & PF_EXT_SOFTIRQ)) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); else if (p != rq->idle) cpustat->system = cputime64_add(cpustat->system, tmp); @@ -4551,7 +4551,7 @@ asmlinkage void __sched preempt_schedule_irq(void) struct thread_info *ti = current_thread_info(); /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + WARN_ON_ONCE(ti->preempt_count || !irqs_disabled()); do { add_preempt_count(PREEMPT_ACTIVE); @@ -5619,9 +5619,14 @@ int cond_resched_lock(spinlock_t *lock) } EXPORT_SYMBOL(cond_resched_lock); +/* + * Voluntarily preempt a process context that has softirqs disabled: + */ int __sched cond_resched_softirq(void) { - BUG_ON(!in_softirq()); +#ifndef CONFIG_PREEMPT_SOFTIRQS + WARN_ON_ONCE(!in_softirq()); +#endif if (need_resched() && system_state == SYSTEM_RUNNING) { local_bh_enable(); @@ -5633,6 +5638,47 @@ int __sched cond_resched_softirq(void) } EXPORT_SYMBOL(cond_resched_softirq); +/* + * Voluntarily preempt a softirq context (possible with softirq threading): + */ +int __sched cond_resched_softirq_context(void) +{ +#ifndef CONFIG_PREEMPT_SOFTIRQS + WARN_ON_ONCE(!in_softirq()); +#endif + if (softirq_need_resched() && system_state == SYSTEM_RUNNING) { + raw_local_irq_disable(); + _local_bh_enable(); + raw_local_irq_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_softirq_context); + +/* + * Preempt a hardirq context if necessary (possible with hardirq threading): + */ +int cond_resched_hardirq_context(void) +{ + WARN_ON_ONCE(!in_irq()); + WARN_ON_ONCE(!irqs_disabled()); + + if (hardirq_need_resched()) { + irq_exit(); + local_irq_enable(); + __cond_resched(); + local_irq_disable(); + __irq_enter(); + + return 1; + } + return 0; +} +EXPORT_SYMBOL(cond_resched_hardirq_context); + /** * yield - yield the current processor to other threads. * diff --git a/kernel/softirq.c b/kernel/softirq.c index e7c69a7..03b49f6 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -8,9 +8,16 @@ * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) * * Remote softirq infrastructure is by Jens Axboe. + * + * + * Softirq-split implemetation by + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar */ #include +#include +#include +#include #include #include #include @@ -50,7 +57,41 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +struct softirqdata { + int nr; + unsigned long cpu; + struct task_struct *tsk; +#ifdef CONFIG_PREEMPT_SOFTIRQS + wait_queue_head_t wait; + int running; +#endif +}; + +static DEFINE_PER_CPU(struct softirqdata [NR_SOFTIRQS], ksoftirqd); + +#ifdef CONFIG_PREEMPT_SOFTIRQS +/* + * Preempting the softirq causes cases that would not be a + * problem when the softirq is not preempted. That is a + * process may have code to spin while waiting for a softirq + * to finish on another CPU. But if it happens that the + * process has preempted the softirq, this could cause a + * deadlock. + */ +void wait_for_softirq(int softirq) +{ + struct softirqdata *data = &__get_cpu_var(ksoftirqd)[softirq]; + if (data->running) { + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&data->wait, &wait); + if (data->running) + schedule(); + remove_wait_queue(&data->wait, &wait); + __set_current_state(TASK_RUNNING); + } +} +#endif /* * we cannot loop indefinitely here to avoid userspace starvation, @@ -58,16 +99,32 @@ static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); * to the pending events, so lets the scheduler to balance * the softirq load for us. */ -static inline void wakeup_softirqd(void) +static void wakeup_softirqd(int softirq) { /* Interrupts are disabled: no need to stop preemption */ - struct task_struct *tsk = __get_cpu_var(ksoftirqd); + struct task_struct *tsk = __get_cpu_var(ksoftirqd)[softirq].tsk; if (tsk && tsk->state != TASK_RUNNING) wake_up_process(tsk); } /* + * Wake up the softirq threads which have work + */ +static void trigger_softirqs(void) +{ + u32 pending = local_softirq_pending(); + int curr = 0; + + while (pending) { + if (pending & 1) + wakeup_softirqd(curr); + pending >>= 1; + curr++; + } +} + +/* * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: */ @@ -102,20 +159,6 @@ void local_bh_disable(void) EXPORT_SYMBOL(local_bh_disable); -void __local_bh_enable(void) -{ - WARN_ON_ONCE(in_irq()); - - /* - * softirqs should never be enabled by __local_bh_enable(), - * it always nests inside local_bh_enable() sections: - */ - WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET); - - sub_preempt_count(SOFTIRQ_OFFSET); -} -EXPORT_SYMBOL_GPL(__local_bh_enable); - /* * Special-case - softirqs can safely be enabled in * cond_resched_softirq(), or by __do_softirq(), @@ -183,7 +226,7 @@ EXPORT_SYMBOL(local_bh_enable_ip); */ #define MAX_SOFTIRQ_RESTART 10 -asmlinkage void __do_softirq(void) +asmlinkage void ___do_softirq(void) { struct softirq_action *h; __u32 pending; @@ -193,9 +236,6 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_system_vtime(current); - __local_bh_disable((unsigned long)__builtin_return_address(0)); - trace_softirq_enter(); - cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ @@ -220,6 +260,7 @@ restart: } rcu_bh_qsctr_inc(cpu); + cond_resched_softirq_context(); } h++; pending >>= 1; @@ -232,12 +273,34 @@ restart: goto restart; if (pending) - wakeup_softirqd(); + trigger_softirqs(); +} + +asmlinkage void __do_softirq(void) +{ +#ifdef CONFIG_PREEMPT_SOFTIRQS + /* + * 'preempt harder'. Push all softirq processing off to ksoftirqd. + */ + if (softirq_preemption) { + if (local_softirq_pending()) + trigger_softirqs(); + return; + } +#endif + /* + * 'immediate' softirq execution: + */ + __local_bh_disable((unsigned long)__builtin_return_address(0)); + trace_softirq_enter(); + + ___do_softirq(); trace_softirq_exit(); account_system_vtime(current); _local_bh_enable(); + } #ifndef __ARCH_HAS_DO_SOFTIRQ @@ -307,19 +370,11 @@ void irq_exit(void) */ inline void raise_softirq_irqoff(unsigned int nr) { - __raise_softirq_irqoff(nr); + __do_raise_softirq_irqoff(nr); - /* - * If we're in an interrupt or softirq, we're done - * (this also catches softirq-disabled code). We will - * actually run the softirq once we return from - * the irq or softirq. - * - * Otherwise we wake up ksoftirqd to make sure we - * schedule the softirq soon. - */ - if (!in_interrupt()) - wakeup_softirqd(); +#ifdef CONFIG_PREEMPT_SOFTIRQS + wakeup_softirqd(nr); +#endif } void raise_softirq(unsigned int nr) @@ -346,15 +401,45 @@ struct tasklet_head static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); +static void inline +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) +{ + if (tasklet_trylock(t)) { +again: + /* We may have been preempted before tasklet_trylock + * and __tasklet_action may have already run. + * So double check the sched bit while the takslet + * is locked before adding it to the list. + */ + if (test_bit(TASKLET_STATE_SCHED, &t->state)) { + t->next = NULL; + *head->tail = t; + head->tail = &(t->next); + raise_softirq_irqoff(nr); + tasklet_unlock(t); + } else { + /* This is subtle. If we hit the corner case above + * It is possible that we get preempted right here, + * and another task has successfully called + * tasklet_schedule(), then this function, and + * failed on the trylock. Thus we must be sure + * before releasing the tasklet lock, that the + * SCHED_BIT is clear. Otherwise the tasklet + * may get its SCHED_BIT set, but not added to the + * list + */ + if (!tasklet_tryunlock(t)) + goto again; + } + } +} + void __tasklet_schedule(struct tasklet_struct *t) { unsigned long flags; local_irq_save(flags); - t->next = NULL; - *__get_cpu_var(tasklet_vec).tail = t; - __get_cpu_var(tasklet_vec).tail = &(t->next); - raise_softirq_irqoff(TASKLET_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_vec), TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -365,50 +450,120 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) unsigned long flags; local_irq_save(flags); - t->next = NULL; - *__get_cpu_var(tasklet_hi_vec).tail = t; - __get_cpu_var(tasklet_hi_vec).tail = &(t->next); - raise_softirq_irqoff(HI_SOFTIRQ); + __tasklet_common_schedule(t, &__get_cpu_var(tasklet_hi_vec), HI_SOFTIRQ); local_irq_restore(flags); } EXPORT_SYMBOL(__tasklet_hi_schedule); -static void tasklet_action(struct softirq_action *a) +void tasklet_enable(struct tasklet_struct *t) { - struct tasklet_struct *list; + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_schedule(t); +} - local_irq_disable(); - list = __get_cpu_var(tasklet_vec).head; - __get_cpu_var(tasklet_vec).head = NULL; - __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; - local_irq_enable(); +EXPORT_SYMBOL(tasklet_enable); + +void tasklet_hi_enable(struct tasklet_struct *t) +{ + if (!atomic_dec_and_test(&t->count)) + return; + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) + tasklet_hi_schedule(t); +} + +EXPORT_SYMBOL(tasklet_hi_enable); + +static void +__tasklet_action(struct softirq_action *a, struct tasklet_struct *list) +{ + int loops = 1000000; while (list) { struct tasklet_struct *t = list; list = list->next; - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); + /* + * Should always succeed - after a tasklist got on the + * list (after getting the SCHED bit set from 0 to 1), + * nothing but the tasklet softirq it got queued to can + * lock it: + */ + if (!tasklet_trylock(t)) { + WARN_ON(1); + continue; } - local_irq_disable(); t->next = NULL; - *__get_cpu_var(tasklet_vec).tail = t; - __get_cpu_var(tasklet_vec).tail = &(t->next); - __raise_softirq_irqoff(TASKLET_SOFTIRQ); - local_irq_enable(); + + /* + * If we cannot handle the tasklet because it's disabled, + * mark it as pending. tasklet_enable() will later + * re-schedule the tasklet. + */ + if (unlikely(atomic_read(&t->count))) { +out_disabled: + /* implicit unlock: */ + wmb(); + t->state = TASKLET_STATEF_PENDING; + continue; + } + + /* + * After this point on the tasklet might be rescheduled + * on another CPU, but it can only be added to another + * CPU's tasklet list if we unlock the tasklet (which we + * dont do yet). + */ + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + WARN_ON(1); + +again: + t->func(t->data); + + /* + * Try to unlock the tasklet. We must use cmpxchg, because + * another CPU might have scheduled or disabled the tasklet. + * We only allow the STATE_RUN -> 0 transition here. + */ + while (!tasklet_tryunlock(t)) { + /* + * If it got disabled meanwhile, bail out: + */ + if (atomic_read(&t->count)) + goto out_disabled; + /* + * If it got scheduled meanwhile, re-execute + * the tasklet function: + */ + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + goto again; + if (!--loops) { + printk("hm, tasklet state: %08lx\n", t->state); + WARN_ON(1); + tasklet_unlock(t); + break; + } + } } } +static void tasklet_action(struct softirq_action *a) +{ + struct tasklet_struct *list; + + local_irq_disable(); + list = __get_cpu_var(tasklet_vec).head; + __get_cpu_var(tasklet_vec).head = NULL; + __get_cpu_var(tasklet_vec).tail = &__get_cpu_var(tasklet_vec).head; + local_irq_enable(); + + __tasklet_action(a, list); +} + static void tasklet_hi_action(struct softirq_action *a) { struct tasklet_struct *list; @@ -419,29 +574,7 @@ static void tasklet_hi_action(struct softirq_action *a) __get_cpu_var(tasklet_hi_vec).tail = &__get_cpu_var(tasklet_hi_vec).head; local_irq_enable(); - while (list) { - struct tasklet_struct *t = list; - - list = list->next; - - if (tasklet_trylock(t)) { - if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) - BUG(); - t->func(t->data); - tasklet_unlock(t); - continue; - } - tasklet_unlock(t); - } - - local_irq_disable(); - t->next = NULL; - *__get_cpu_var(tasklet_hi_vec).tail = t; - __get_cpu_var(tasklet_hi_vec).tail = &(t->next); - __raise_softirq_irqoff(HI_SOFTIRQ); - local_irq_enable(); - } + __tasklet_action(a, list); } @@ -615,13 +748,24 @@ void __init softirq_init(void) open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -static int ksoftirqd(void * __bind_cpu) +static int ksoftirqd(void * __data) { + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2 }; + struct softirqdata *data = __data; + u32 mask = (1 << data->nr); + struct softirq_action *h; + +#ifdef CONFIG_PREEMPT_SOFTIRQS + init_waitqueue_head(&data->wait); +#endif + + sys_sched_setscheduler(current->pid, SCHED_FIFO, ¶m); + current->flags |= PF_EXT_SOFTIRQ; set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { preempt_disable(); - if (!local_softirq_pending()) { + if (!(local_softirq_pending() & mask)) { preempt_enable_no_resched(); schedule(); preempt_disable(); @@ -629,19 +773,41 @@ static int ksoftirqd(void * __bind_cpu) __set_current_state(TASK_RUNNING); - while (local_softirq_pending()) { +#ifdef CONFIG_PREEMPT_SOFTIRQS + data->running = 1; +#endif + + while (local_softirq_pending() & mask) { /* Preempt disable stops cpu going offline. If already offline, we'll be on wrong CPU: don't process */ - if (cpu_is_offline((long)__bind_cpu)) + if (cpu_is_offline(data->cpu)) goto wait_to_die; - do_softirq(); + + local_irq_disable(); preempt_enable_no_resched(); + set_softirq_pending(local_softirq_pending() & ~mask); + local_bh_disable(); + local_irq_enable(); + + h = &softirq_vec[data->nr]; + if (h) + h->action(h); + rcu_bh_qsctr_inc(data->cpu); + + local_irq_disable(); + _local_bh_enable(); + local_irq_enable(); + cond_resched(); preempt_disable(); } preempt_enable(); set_current_state(TASK_INTERRUPTIBLE); +#ifdef CONFIG_PREEMPT_SOFTIRQS + data->running = 0; + wake_up(&data->wait); +#endif } __set_current_state(TASK_RUNNING); return 0; @@ -691,7 +857,7 @@ void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) BUG(); } -static void takeover_tasklets(unsigned int cpu) +void takeover_tasklets(unsigned int cpu) { /* CPU is dead, so no lock needed. */ local_irq_disable(); @@ -717,49 +883,103 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_PREEMPT_SOFTIRQS +static const char *softirq_names [] = +{ + [HI_SOFTIRQ] = "high", + [SCHED_SOFTIRQ] = "sched", + [TIMER_SOFTIRQ] = "timer", + [NET_TX_SOFTIRQ] = "net-tx", + [NET_RX_SOFTIRQ] = "net-rx", + [BLOCK_SOFTIRQ] = "block", + [TASKLET_SOFTIRQ] = "tasklet", +#ifdef CONFIG_HIGH_RES_TIMERS + [HRTIMER_SOFTIRQ] = "hrtimer", +#endif + [RCU_SOFTIRQ] = "rcu", +}; +#endif + static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int hotcpu = (unsigned long)hcpu; + int hotcpu = (unsigned long)hcpu, i; struct task_struct *p; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); - if (IS_ERR(p)) { - printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; +#ifdef CONFIG_PREEMPT_SOFTIRQS + for (i = 0; i < NR_SOFTIRQS; i++) { + per_cpu(ksoftirqd, hotcpu)[i].nr = i; + per_cpu(ksoftirqd, hotcpu)[i].cpu = hotcpu; + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + p = kthread_create(ksoftirqd, + &per_cpu(ksoftirqd, hotcpu)[i], + "sirq-%s/%d", softirq_names[i], + hotcpu); + if (IS_ERR(p)) { + printk("ksoftirqd %d (%s) for %i failed\n", i, + softirq_names[i], hotcpu); + return NOTIFY_BAD; + } + kthread_bind(p, hotcpu); + per_cpu(ksoftirqd, hotcpu)[i].tsk = p; + } +#else + p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + if (IS_ERR(p)) { + printk("ksoftirqd for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + kthread_bind(p, hotcpu); + for (i = 0; i < NR_SOFTIRQS; i++) { + per_cpu(ksoftirqd, hotcpu)[i].nr = i; + per_cpu(ksoftirqd, hotcpu)[i].cpu = hotcpu; + per_cpu(ksoftirqd, hotcpu)[i].tsk = p; } - kthread_bind(p, hotcpu); - per_cpu(ksoftirqd, hotcpu) = p; - break; +#endif + break; + break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(ksoftirqd, hotcpu)); +#ifdef CONFIG_PREEMPT_SOFTIRQS + /* + * The softirq thread stays in a TASK_UNINTERRUPTIBLE + * state till somebody uses it. If it is never used + * it can trigger a hung task warning. + */ + for (i = 0; i < NR_SOFTIRQS; i++) + wake_up_process(per_cpu(ksoftirqd, hotcpu)[i].tsk); +#endif break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(ksoftirqd, hotcpu)) - break; - /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(ksoftirqd, hotcpu), - any_online_cpu(cpu_online_map)); +#if 0 + for (i = 0; i < NR_SOFTIRQS; i++) { + if (!per_cpu(ksoftirqd, hotcpu)[i].tsk) + continue; + kthread_bind(per_cpu(ksoftirqd, hotcpu)[i].tsk, + any_online_cpu(cpu_online_map)); + } +#endif case CPU_DEAD: case CPU_DEAD_FROZEN: { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - p = per_cpu(ksoftirqd, hotcpu); - per_cpu(ksoftirqd, hotcpu) = NULL; - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_stop(p); + for (i = 0; i < NR_SOFTIRQS; i++) { + p = per_cpu(ksoftirqd, hotcpu)[i].tsk; + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + sched_setscheduler(p, SCHED_FIFO, ¶m); + kthread_stop(p); + } takeover_tasklets(hotcpu); break; - } -#endif /* CONFIG_HOTPLUG_CPU */ } +#endif /* CONFIG_HOTPLUG_CPU */ + } return NOTIFY_OK; } @@ -779,6 +999,29 @@ static __init int spawn_ksoftirqd(void) } early_initcall(spawn_ksoftirqd); + +#ifdef CONFIG_PREEMPT_SOFTIRQS + +int softirq_preemption = 1; + +EXPORT_SYMBOL(softirq_preemption); + +static int __init softirq_preempt_setup (char *str) +{ + if (!strncmp(str, "off", 3)) + softirq_preemption = 0; + else + get_option(&str, &softirq_preemption); + if (!softirq_preemption) + printk("turning off softirq preemption!\n"); + + return 1; +} + +__setup("softirq-preempt=", softirq_preempt_setup); + +#endif + #ifdef CONFIG_SMP /* * Call a function on all processors diff --git a/kernel/timer.c b/kernel/timer.c index dbd50fa..b6bb450 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -69,6 +70,7 @@ struct tvec_root { struct tvec_base { spinlock_t lock; struct timer_list *running_timer; + wait_queue_head_t wait_for_running_timer; unsigned long timer_jiffies; struct tvec_root tv1; struct tvec tv2; @@ -316,9 +318,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative); static inline void set_running_timer(struct tvec_base *base, struct timer_list *timer) { -#ifdef CONFIG_SMP base->running_timer = timer; -#endif } static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) @@ -593,7 +593,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0; + int ret = 0, cpu; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -607,7 +607,8 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) debug_timer_activate(timer); - new_base = __get_cpu_var(tvec_bases); + cpu = raw_smp_processor_id(); + new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { /* @@ -666,6 +667,18 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_unlock_irqrestore(&base->lock, flags); } +/* + * Wait for a running timer + */ +void wait_for_running_timer(struct timer_list *timer) +{ + struct tvec_base *base = timer->base; + + if (base->running_timer == timer) + wait_event(base->wait_for_running_timer, + base->running_timer != timer); +} + /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified @@ -736,7 +749,35 @@ int del_timer(struct timer_list *timer) EXPORT_SYMBOL(del_timer); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS) +/* + * This function checks whether a timer is active and not running on any + * CPU. Upon successful (ret >= 0) exit the timer is not queued and the + * handler is not running on any CPU. + * + * It must not be called from interrupt contexts. + */ +int timer_pending_sync(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = -1; + + base = lock_timer_base(timer, &flags); + + if (base->running_timer == timer) + goto out; + + ret = 0; + if (timer_pending(timer)) + ret = 1; +out: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + + /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: timer do del @@ -793,7 +834,7 @@ int del_timer_sync(struct timer_list *timer) int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; - cpu_relax(); + wait_for_running_timer(timer); } } @@ -839,6 +880,20 @@ static inline void __run_timers(struct tvec_base *base) struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; + if (softirq_need_resched()) { + spin_unlock_irq(&base->lock); + wake_up(&base->wait_for_running_timer); + cond_resched_softirq_context(); + cpu_relax(); + spin_lock_irq(&base->lock); + /* + * We can simply continue after preemption, nobody + * else can touch timer_jiffies so 'index' is still + * valid. Any new jiffy will be taken care of in + * subsequent loops: + */ + } + /* * Cascade timers: */ @@ -866,18 +921,17 @@ static inline void __run_timers(struct tvec_base *base) int preempt_count = preempt_count(); fn(data); if (preempt_count != preempt_count()) { - printk(KERN_ERR "huh, entered %p " - "with preempt_count %08x, exited" - " with %08x?\n", - fn, preempt_count, - preempt_count()); - BUG(); + print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn); + printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count()); + preempt_count() = preempt_count; } } + set_running_timer(base, NULL); + cond_resched_softirq_context(); spin_lock_irq(&base->lock); } } - set_running_timer(base, NULL); + wake_up(&base->wait_for_running_timer); spin_unlock_irq(&base->lock); } @@ -1044,11 +1098,11 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); + scheduler_tick(); run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); printk_tick(); - scheduler_tick(); run_posix_cpu_timers(p); } @@ -1057,7 +1111,15 @@ void update_process_times(int user_tick) */ static unsigned long count_active_tasks(void) { + /* + * On PREEMPT_SOFITRQS, we are running in the loadavg thread, + * so consider one less running task in these two cases. + */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + return (nr_active() - 1) * FIXED_1; +#else return nr_active() * FIXED_1; +#endif } /* @@ -1094,19 +1156,6 @@ static inline void calc_load(unsigned long ticks) } /* - * This function runs timers and the timer-tq in bottom half context. - */ -static void run_timer_softirq(struct softirq_action *h) -{ - struct tvec_base *base = __get_cpu_var(tvec_bases); - - hrtimer_run_pending(); - - if (time_after_eq(jiffies, base->timer_jiffies)) - __run_timers(base); -} - -/* * Called by the local, per-CPU timer interrupt on SMP. */ void run_local_timers(void) @@ -1117,13 +1166,36 @@ void run_local_timers(void) } /* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! + * Time of day handling: + */ +static inline void update_times(void) +{ + static unsigned long last_tick = INITIAL_JIFFIES; + unsigned long ticks, flags; + + write_seqlock_irqsave(&xtime_lock, flags); + ticks = jiffies - last_tick; + if (ticks) { + last_tick += ticks; + update_wall_time(); + calc_load(ticks); + } + write_sequnlock_irqrestore(&xtime_lock, flags); +} + + +/* + * This function runs timers and the timer-tq in bottom half context. */ -static inline void update_times(unsigned long ticks) +static void run_timer_softirq(struct softirq_action *h) { - update_wall_time(); - calc_load(ticks); + struct tvec_base *base = __get_cpu_var(tvec_bases); + + update_times(); + hrtimer_run_pending(); + + if (time_after_eq(jiffies, base->timer_jiffies)) + __run_timers(base); } /* @@ -1135,7 +1207,6 @@ static inline void update_times(unsigned long ticks) void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_times(ticks); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1469,6 +1540,7 @@ static int __cpuinit init_timers_cpu(int cpu) } spin_lock_init(&base->lock); + init_waitqueue_head(&base->wait_for_running_timer); for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); diff --git a/net/core/dev.c b/net/core/dev.c index 9174c77..6db5893 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1862,9 +1862,13 @@ gso: Either shot noqueue qdisc, it is even simpler 8) */ if (dev->flags & IFF_UP) { - int cpu = smp_processor_id(); /* ok because BHs are off */ + int cpu = raw_smp_processor_id(); /* ok because BHs are off */ - if (txq->xmit_lock_owner != cpu) { +#ifdef CONFIG_PREEMPT_SOFTIRQS + (void)cpu; /* threaded softirqs do not use this variable */ +#endif + /* IRQs as threads do not risk recursion. */ + if (THREADED_IRQS || !test_net_xmit_lock_owner(txq, cpu)) { HARD_TX_LOCK(dev, txq, cpu); @@ -2016,6 +2020,23 @@ static void net_tx_action(struct softirq_action *h) head = head->next_sched; root_lock = qdisc_lock(q); + /* + * We are executing in softirq context here, and + * if softirqs are preemptible, we must avoid + * infinite reactivation of the softirq by + * either the tx handler, or by netif_schedule(). + * (it would result in an infinitely looping + * softirq context) + * So we take the spinlock unconditionally. + */ +#ifdef CONFIG_PREEMPT_SOFTIRQS + spin_lock(root_lock); + smp_mb__before_clear_bit(); + clear_bit(__QDISC_STATE_SCHED, + &q->state); + qdisc_run(q); + spin_unlock(root_lock); +#else if (spin_trylock(root_lock)) { smp_mb__before_clear_bit(); clear_bit(__QDISC_STATE_SCHED, @@ -2032,6 +2053,7 @@ static void net_tx_action(struct softirq_action *h) &q->state); } } +#endif } } } @@ -3941,7 +3963,7 @@ static void __netdev_init_queue_locks_one(struct net_device *dev, { spin_lock_init(&dev_queue->_xmit_lock); netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); - dev_queue->xmit_lock_owner = -1; + clear_net_xmit_lock_owner(dev_queue); } static void netdev_init_queue_locks(struct net_device *dev) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index cdcd16f..a9ea280 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -78,7 +78,7 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, { int ret; - if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) { + if (unlikely(test_net_xmit_lock_owner(dev_queue, smp_processor_id()))) { /* * Same CPU holding the lock. It may be a transient * configuration error, when hard_start_xmit() recurses. We