1 /*
2 * Read-Copy Update preempt priority boosting
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright Red Hat Inc, 2007
19 *
20 * Authors: Steven Rostedt <srostedt@redhat.com>
21 *
22 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>.
23 *
24 */
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/kthread.h>
#include <linux/err.h>
32
33 DEFINE_RAW_SPINLOCK(rcu_boost_wake_lock);
34 static int rcu_boost_prio = MAX_PRIO; /* Prio to set preempted RCU readers */
35 static long rcu_boost_counter; /* used to keep track of who boosted */
36 static int rcu_preempt_thread_secs = 3; /* Seconds between waking rcupreemptd thread */
37
38 struct rcu_boost_dat {
39 raw_spinlock_t rbs_lock; /* Sync changes to this struct */
40 int rbs_prio; /* CPU copy of rcu_boost_prio */
41 struct list_head rbs_toboost; /* Preempted RCU readers */
42 struct list_head rbs_boosted; /* RCU readers that have been boosted */
43 #ifdef CONFIG_RCU_TRACE
44 /* The rest are for statistics */
45 unsigned long rbs_stat_task_boost_called;
46 unsigned long rbs_stat_task_boosted;
47 unsigned long rbs_stat_boost_called;
48 unsigned long rbs_stat_try_boost;
49 unsigned long rbs_stat_boosted;
50 unsigned long rbs_stat_unboost_called;
51 unsigned long rbs_stat_unboosted;
52 unsigned long rbs_stat_try_boost_readers;
53 unsigned long rbs_stat_boost_readers;
54 unsigned long rbs_stat_try_unboost_readers;
55 unsigned long rbs_stat_unboost_readers;
56 unsigned long rbs_stat_over_taken;
57 #endif /* CONFIG_RCU_TRACE */
58 };
59
60 static DEFINE_PER_CPU(struct rcu_boost_dat, rcu_boost_data);
61 #define RCU_BOOST_ME &__get_cpu_var(rcu_boost_data)
62
63 #ifdef CONFIG_RCU_TRACE
64
65 #define RCUPREEMPT_BOOST_TRACE_BUF_SIZE 4096
66 static char rcupreempt_boost_trace_buf[RCUPREEMPT_BOOST_TRACE_BUF_SIZE];
67
68 static ssize_t rcuboost_read(struct file *filp, char __user *buffer,
69 size_t count, loff_t *ppos)
70 {
71 static DEFINE_MUTEX(mutex);
72 int cnt = 0;
73 int cpu;
74 struct rcu_boost_dat *rbd;
75 ssize_t bcount;
76 unsigned long task_boost_called = 0;
77 unsigned long task_boosted = 0;
78 unsigned long boost_called = 0;
79 unsigned long try_boost = 0;
80 unsigned long boosted = 0;
81 unsigned long unboost_called = 0;
82 unsigned long unboosted = 0;
83 unsigned long try_boost_readers = 0;
84 unsigned long boost_readers = 0;
85 unsigned long try_unboost_readers = 0;
86 unsigned long unboost_readers = 0;
87 unsigned long over_taken = 0;
88
89 mutex_lock(&mutex);
90
91 for_each_online_cpu(cpu) {
92 rbd = &per_cpu(rcu_boost_data, cpu);
93
94 task_boost_called += rbd->rbs_stat_task_boost_called;
95 task_boosted += rbd->rbs_stat_task_boosted;
96 boost_called += rbd->rbs_stat_boost_called;
97 try_boost += rbd->rbs_stat_try_boost;
98 boosted += rbd->rbs_stat_boosted;
99 unboost_called += rbd->rbs_stat_unboost_called;
100 unboosted += rbd->rbs_stat_unboosted;
101 try_boost_readers += rbd->rbs_stat_try_boost_readers;
102 boost_readers += rbd->rbs_stat_boost_readers;
103 try_unboost_readers += rbd->rbs_stat_try_boost_readers;
104 unboost_readers += rbd->rbs_stat_boost_readers;
105 over_taken += rbd->rbs_stat_over_taken;
106 }
107
108 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
109 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
110 "task_boost_called = %ld\n",
111 task_boost_called);
112 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
113 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
114 "task_boosted = %ld\n",
115 task_boosted);
116 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
117 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
118 "boost_called = %ld\n",
119 boost_called);
120 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
121 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
122 "try_boost = %ld\n",
123 try_boost);
124 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
125 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
126 "boosted = %ld\n",
127 boosted);
128 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
129 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
130 "unboost_called = %ld\n",
131 unboost_called);
132 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
133 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
134 "unboosted = %ld\n",
135 unboosted);
136 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
137 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
138 "try_boost_readers = %ld\n",
139 try_boost_readers);
140 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
141 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
142 "boost_readers = %ld\n",
143 boost_readers);
144 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
145 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
146 "try_unboost_readers = %ld\n",
147 try_unboost_readers);
148 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
149 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
150 "unboost_readers = %ld\n",
151 unboost_readers);
152 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
153 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
154 "over_taken = %ld\n",
155 over_taken);
156 cnt += snprintf(&rcupreempt_boost_trace_buf[cnt],
157 RCUPREEMPT_BOOST_TRACE_BUF_SIZE - cnt,
158 "rcu_boost_prio = %d\n",
159 rcu_boost_prio);
160 bcount = simple_read_from_buffer(buffer, count, ppos,
161 rcupreempt_boost_trace_buf, strlen(rcupreempt_boost_trace_buf));
162 mutex_unlock(&mutex);
163
164 return bcount;
165 }
166
167 static struct file_operations rcuboost_fops = {
168 .read = rcuboost_read,
169 };
170
171 static struct dentry *rcuboostdir;
172 int rcu_trace_boost_create(struct dentry *rcudir)
173 {
174 rcuboostdir = debugfs_create_file("rcuboost", 0444, rcudir,
175 NULL, &rcuboost_fops);
176 if (!rcuboostdir)
177 return 0;
178
179 return 1;
180 }
181 EXPORT_SYMBOL_GPL(rcu_trace_boost_create);
182
183 void rcu_trace_boost_destroy(void)
184 {
185 if (rcuboostdir)
186 debugfs_remove(rcuboostdir);
187 rcuboostdir = NULL;
188 }
189 EXPORT_SYMBOL_GPL(rcu_trace_boost_destroy);
190
191 #define RCU_BOOST_TRACE_FUNC_DECL(type) \
192 static void rcu_trace_boost_##type(struct rcu_boost_dat *rbd) \
193 { \
194 rbd->rbs_stat_##type++; \
195 }
196 RCU_BOOST_TRACE_FUNC_DECL(task_boost_called)
197 RCU_BOOST_TRACE_FUNC_DECL(task_boosted)
198 RCU_BOOST_TRACE_FUNC_DECL(boost_called)
199 RCU_BOOST_TRACE_FUNC_DECL(try_boost)
200 RCU_BOOST_TRACE_FUNC_DECL(boosted)
201 RCU_BOOST_TRACE_FUNC_DECL(unboost_called)
202 RCU_BOOST_TRACE_FUNC_DECL(unboosted)
203 RCU_BOOST_TRACE_FUNC_DECL(try_boost_readers)
204 RCU_BOOST_TRACE_FUNC_DECL(boost_readers)
205 RCU_BOOST_TRACE_FUNC_DECL(try_unboost_readers)
206 RCU_BOOST_TRACE_FUNC_DECL(unboost_readers)
207 RCU_BOOST_TRACE_FUNC_DECL(over_taken)
208 #else /* CONFIG_RCU_TRACE */
/*
 * Without CONFIG_RCU_TRACE there are no counters to update, so the
 * helpers that RCU_BOOST_TRACE_FUNC_DECL would otherwise generate are
 * replaced by no-op macros that discard their argument.
 */
#define rcu_trace_boost_task_boost_called(rbd)		do { } while (0)
#define rcu_trace_boost_task_boosted(rbd)		do { } while (0)
#define rcu_trace_boost_boost_called(rbd)		do { } while (0)
#define rcu_trace_boost_try_boost(rbd)			do { } while (0)
#define rcu_trace_boost_boosted(rbd)			do { } while (0)
#define rcu_trace_boost_unboost_called(rbd)		do { } while (0)
#define rcu_trace_boost_unboosted(rbd)			do { } while (0)
#define rcu_trace_boost_try_boost_readers(rbd)		do { } while (0)
#define rcu_trace_boost_boost_readers(rbd)		do { } while (0)
#define rcu_trace_boost_try_unboost_readers(rbd)	do { } while (0)
#define rcu_trace_boost_unboost_readers(rbd)		do { } while (0)
#define rcu_trace_boost_over_taken(rbd)			do { } while (0)
222 #endif /* CONFIG_RCU_TRACE */
223
224 static inline int rcu_is_boosted(struct task_struct *task)
225 {
226 return !list_empty(&task->rcub_entry);
227 }
228
229 /*
230 * Helper function to boost a task's prio.
231 */
232 static void rcu_boost_task(struct task_struct *task)
233 {
234 WARN_ON(!irqs_disabled());
235 WARN_ON_SMP(!spin_is_locked(&task->pi_lock));
236
237 rcu_trace_boost_task_boost_called(RCU_BOOST_ME);
238
239 if (task->rcu_prio < task->prio) {
240 rcu_trace_boost_task_boosted(RCU_BOOST_ME);
241 task_setprio(task, task->rcu_prio);
242 }
243 }
244
245 /**
246 * __rcu_preepmt_boost - Called by sleeping RCU readers.
247 *
248 * When the RCU read-side critical section is preempted
249 * (or schedules out due to RT mutex)
250 * it places itself onto a list to notify that it is sleeping
251 * while holding a RCU read lock. If there is already a
252 * synchronize_rcu happening, then it will increase its
253 * priority (if necessary).
254 */
255 void __rcu_preempt_boost(void)
256 {
257 struct task_struct *curr = current;
258 struct rcu_boost_dat *rbd;
259 int prio;
260 unsigned long flags;
261
262 WARN_ON(!current->rcu_read_lock_nesting);
263
264 rcu_trace_boost_boost_called(RCU_BOOST_ME);
265
266 /* check to see if we are already boosted */
267 if (unlikely(rcu_is_boosted(curr)))
268 return;
269
270 /*
271 * To keep us from preempting between grabing
272 * the rbd and locking it, we use local_irq_save
273 */
274 local_irq_save(flags);
275 rbd = &__get_cpu_var(rcu_boost_data);
276 spin_lock(&rbd->rbs_lock);
277
278 spin_lock(&curr->pi_lock);
279
280 curr->rcub_rbdp = rbd;
281
282 rcu_trace_boost_try_boost(rbd);
283
284 prio = rt_mutex_getprio(curr);
285
286 if (list_empty(&curr->rcub_entry))
287 list_add_tail(&curr->rcub_entry, &rbd->rbs_toboost);
288 if (prio <= rbd->rbs_prio)
289 goto out;
290
291 rcu_trace_boost_boosted(curr->rcub_rbdp);
292
293 set_rcu_prio(curr, rbd->rbs_prio);
294 rcu_boost_task(curr);
295
296 out:
297 spin_unlock(&curr->pi_lock);
298 spin_unlock_irqrestore(&rbd->rbs_lock, flags);
299 }
300
301 /**
302 * __rcu_preempt_unboost - called when releasing the RCU read lock
303 *
304 * When releasing the RCU read lock, a check is made to see if
305 * the task was preempted. If it was, it removes itself from the
306 * RCU data lists and if necessary, sets its priority back to
307 * normal.
308 */
309 void __rcu_preempt_unboost(void)
310 {
311 struct task_struct *curr = current;
312 struct rcu_boost_dat *rbd;
313 int prio;
314 unsigned long flags;
315
316 rcu_trace_boost_unboost_called(RCU_BOOST_ME);
317
318 /* if not boosted, then ignore */
319 if (likely(!rcu_is_boosted(curr)))
320 return;
321
322 /*
323 * Need to be very careful with NMIs.
324 * If we take the lock and an NMI comes in
325 * and it may try to unboost us if curr->rcub_rbdp
326 * is still set. So we zero it before grabbing the lock.
327 * But this also means that we might be boosted again
328 * so the boosting code needs to be aware of this.
329 */
330 rbd = curr->rcub_rbdp;
331 curr->rcub_rbdp = NULL;
332
333 /*
334 * Now an NMI might have came in after we grab
335 * the below lock. This check makes sure that
336 * the NMI doesn't try grabbing the lock
337 * while we already have it.
338 */
339 if (unlikely(!rbd))
340 return;
341
342 spin_lock_irqsave(&rbd->rbs_lock, flags);
343 /*
344 * It is still possible that an NMI came in
345 * between the "is_boosted" check and setting
346 * the rcu_rbdp to NULL. This would mean that
347 * the NMI already dequeued us.
348 */
349 if (unlikely(!rcu_is_boosted(curr)))
350 goto out;
351
352 list_del_init(&curr->rcub_entry);
353
354 rcu_trace_boost_unboosted(rbd);
355
356 set_rcu_prio(curr, MAX_PRIO);
357
358 spin_lock(&curr->pi_lock);
359 prio = rt_mutex_getprio(curr);
360 task_setprio(curr, prio);
361
362 curr->rcub_rbdp = NULL;
363
364 spin_unlock(&curr->pi_lock);
365 out:
366 spin_unlock_irqrestore(&rbd->rbs_lock, flags);
367 }
368
369 /*
370 * For each rcu_boost_dat structure, update all the tasks that
371 * are on the lists to the priority of the caller of
372 * synchronize_rcu.
373 */
374 static int __rcu_boost_readers(struct rcu_boost_dat *rbd, int prio, unsigned long flags)
375 {
376 struct task_struct *curr = current;
377 struct task_struct *p;
378
379 spin_lock(&rbd->rbs_lock);
380
381 rbd->rbs_prio = prio;
382
383 /*
384 * Move the already boosted readers onto the list and reboost
385 * them.
386 */
387 list_splice_init(&rbd->rbs_boosted,
388 &rbd->rbs_toboost);
389
390 while (!list_empty(&rbd->rbs_toboost)) {
391 p = list_entry(rbd->rbs_toboost.next,
392 struct task_struct, rcub_entry);
393 list_move_tail(&p->rcub_entry,
394 &rbd->rbs_boosted);
395 set_rcu_prio(p, prio);
396 spin_lock(&p->pi_lock);
397 rcu_boost_task(p);
398 spin_unlock(&p->pi_lock);
399
400 /*
401 * Now we release the lock to allow for a higher
402 * priority task to come in and boost the readers
403 * even higher. Or simply to let a higher priority
404 * task to run now.
405 */
406 spin_unlock(&rbd->rbs_lock);
407 spin_unlock_irqrestore(&rcu_boost_wake_lock, flags);
408
409 cpu_relax();
410 spin_lock_irqsave(&rcu_boost_wake_lock, flags);
411 /*
412 * Another task may have taken over.
413 */
414 if (curr->rcu_preempt_counter != rcu_boost_counter) {
415 rcu_trace_boost_over_taken(rbd);
416 return 1;
417 }
418
419 spin_lock(&rbd->rbs_lock);
420 }
421
422 spin_unlock(&rbd->rbs_lock);
423
424 return 0;
425 }
426
427 /**
428 * rcu_boost_readers - called by synchronize_rcu to boost sleeping RCU readers.
429 *
430 * This function iterates over all the per_cpu rcu_boost_data descriptors
431 * and boosts any sleeping (or slept) RCU readers.
432 */
433 void rcu_boost_readers(void)
434 {
435 struct task_struct *curr = current;
436 struct rcu_boost_dat *rbd;
437 unsigned long flags;
438 int prio;
439 int cpu;
440 int ret;
441
442 spin_lock_irqsave(&rcu_boost_wake_lock, flags);
443
444 prio = rt_mutex_getprio(curr);
445
446 rcu_trace_boost_try_boost_readers(RCU_BOOST_ME);
447
448 if (prio >= rcu_boost_prio) {
449 /* already boosted */
450 spin_unlock_irqrestore(&rcu_boost_wake_lock, flags);
451 return;
452 }
453
454 rcu_boost_prio = prio;
455
456 rcu_trace_boost_boost_readers(RCU_BOOST_ME);
457
458 /* Flag that we are the one to unboost */
459 curr->rcu_preempt_counter = ++rcu_boost_counter;
460
461 for_each_online_cpu(cpu) {
462 rbd = &per_cpu(rcu_boost_data, cpu);
463 ret = __rcu_boost_readers(rbd, prio, flags);
464 if (ret)
465 break;
466 }
467
468 spin_unlock_irqrestore(&rcu_boost_wake_lock, flags);
469
470 }
471
472 /**
473 * rcu_unboost_readers - set the boost level back to normal.
474 *
475 * This function DOES NOT change the priority of any RCU reader
476 * that was boosted. The RCU readers do that when they release
477 * the RCU lock. This function only sets the global
478 * rcu_boost_prio to MAX_PRIO so that new RCU readers that sleep
479 * do not increase their priority.
480 */
481 void rcu_unboost_readers(void)
482 {
483 struct rcu_boost_dat *rbd;
484 unsigned long flags;
485 int cpu;
486
487 spin_lock_irqsave(&rcu_boost_wake_lock, flags);
488
489 rcu_trace_boost_try_unboost_readers(RCU_BOOST_ME);
490
491 if (current->rcu_preempt_counter != rcu_boost_counter)
492 goto out;
493
494 rcu_trace_boost_unboost_readers(RCU_BOOST_ME);
495
496 /*
497 * We could also put in something that
498 * would allow other synchronize_rcu callers
499 * of lower priority that are still waiting
500 * to boost the prio.
501 */
502 rcu_boost_prio = MAX_PRIO;
503
504 for_each_online_cpu(cpu) {
505 rbd = &per_cpu(rcu_boost_data, cpu);
506
507 spin_lock(&rbd->rbs_lock);
508 rbd->rbs_prio = rcu_boost_prio;
509 spin_unlock(&rbd->rbs_lock);
510 }
511
512 out:
513 spin_unlock_irqrestore(&rcu_boost_wake_lock, flags);
514 }
515
516 /*
517 * The krcupreemptd wakes up every "rcu_preempt_thread_secs"
518 * seconds at the minimum priority of 1 to do a
519 * synchronize_rcu. This ensures that grace periods finish
520 * and that we do not starve the system. If there are RT
521 * tasks above priority 1 that are hogging the system and
522 * preventing release of memory, then its the fault of the
523 * system designer running RT tasks too aggressively and the
524 * system is flawed regardless.
525 */
526 static int krcupreemptd(void *data)
527 {
528 struct sched_param param = { .sched_priority = 1 };
529 int ret;
530 int prio;
531
532 ret = sched_setscheduler(current, SCHED_FIFO, ¶m);
533 printk("krcupreemptd setsched %d\n", ret);
534 prio = current->prio;
535 printk(" prio = %d\n", prio);
536 set_current_state(TASK_INTERRUPTIBLE);
537
538 while (!kthread_should_stop()) {
539 schedule_timeout(rcu_preempt_thread_secs * HZ);
540
541 __set_current_state(TASK_RUNNING);
542 if (prio != current->prio) {
543 prio = current->prio;
544 printk("krcupreemptd new prio is %d??\n",prio);
545 }
546
547 synchronize_rcu();
548
549 set_current_state(TASK_INTERRUPTIBLE);
550 }
551 __set_current_state(TASK_RUNNING);
552 return 0;
553 }
554
555 int __init rcu_preempt_boost_init(void)
556 {
557 struct rcu_boost_dat *rbd;
558 int cpu;
559
560 for_each_possible_cpu(cpu) {
561 rbd = &per_cpu(rcu_boost_data, cpu);
562
563 spin_lock_init(&rbd->rbs_lock);
564 rbd->rbs_prio = MAX_PRIO;
565 INIT_LIST_HEAD(&rbd->rbs_toboost);
566 INIT_LIST_HEAD(&rbd->rbs_boosted);
567 }
568
569 return 0;
570 }
571
572 static int __init rcu_preempt_start_krcupreemptd(void)
573 {
574 struct task_struct *p;
575
576 p = kthread_create(krcupreemptd, NULL,
577 "krcupreemptd");
578
579 if (IS_ERR(p)) {
580 printk("krcupreemptd failed\n");
581 return NOTIFY_BAD;
582 }
583 wake_up_process(p);
584
585 return 0;
586 }
587
588 __initcall(rcu_preempt_start_krcupreemptd);
589
/* This page was automatically generated by the LXR engine. */