1 /*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
6 *
7 * NUMA policy allows the user to give hints in which node(s) memory should
8 * be allocated.
9 *
10 * Support four policies per VMA and per process:
11 *
12 * The VMA policy has priority over the process policy for a page fault.
13 *
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy a per-process
19 * counter is used.
20 * bind Only allocate memory on a specific set of nodes,
21 * no fallback.
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default
26 * process policy.
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
30 *
31 * The process policy is applied for most non interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
35 *
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
39 *
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
42 * on systems with highmem, kernel lowmem allocations don't get policied.
43 * Same with GFP_DMA allocations.
44 *
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
47 */
48
49 /* Notebook:
50 fix mmap readahead to honour policy and enable policy for any page cache
51 object
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
54 first item above.
55 handle mremap for shared memory (currently ignored for the policy)
56 grows down?
57 make bind policy root only? It can trigger oom much faster and the
58 kernel does not always handle that gracefully.
59 could replace all the switch()es with a mempolicy_ops structure.
60 */
61
62 #include <linux/mempolicy.h>
63 #include <linux/mm.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
68 #include <linux/mm.h>
69 #include <linux/nodemask.h>
70 #include <linux/gfp.h>
71 #include <linux/slab.h>
72 #include <linux/string.h>
73 #include <linux/module.h>
74 #include <linux/interrupt.h>
75 #include <linux/init.h>
76 #include <linux/compat.h>
77 #include <linux/mempolicy.h>
78 #include <asm/tlbflush.h>
79 #include <asm/uaccess.h>
80
81 static kmem_cache_t *policy_cache;
82 static kmem_cache_t *sn_cache;
83
84 #define PDprintk(fmt...)
85
86 /* Highest zone. An specific allocation for a zone below that is not
87 policied. */
88 static int policy_zone;
89
90 static struct mempolicy default_policy = {
91 .refcnt = ATOMIC_INIT(1), /* never free it */
92 .policy = MPOL_DEFAULT,
93 };
94
95 /* Check if all specified nodes are online */
96 static int nodes_online(unsigned long *nodes)
97 {
98 DECLARE_BITMAP(online2, MAX_NUMNODES);
99
100 bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
101 if (bitmap_empty(online2, MAX_NUMNODES))
102 set_bit(0, online2);
103 if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
104 return -EINVAL;
105 return 0;
106 }
107
108 /* Do sanity checking on a policy */
109 static int mpol_check_policy(int mode, unsigned long *nodes)
110 {
111 int empty = bitmap_empty(nodes, MAX_NUMNODES);
112
113 switch (mode) {
114 case MPOL_DEFAULT:
115 if (!empty)
116 return -EINVAL;
117 break;
118 case MPOL_BIND:
119 case MPOL_INTERLEAVE:
120 /* Preferred will only use the first bit, but allow
121 more for now. */
122 if (empty)
123 return -EINVAL;
124 break;
125 }
126 return nodes_online(nodes);
127 }
128
129 /* Copy a node mask from user space. */
130 static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
131 unsigned long maxnode, int mode)
132 {
133 unsigned long k;
134 unsigned long nlongs;
135 unsigned long endmask;
136
137 --maxnode;
138 bitmap_zero(nodes, MAX_NUMNODES);
139 if (maxnode == 0 || !nmask)
140 return 0;
141
142 nlongs = BITS_TO_LONGS(maxnode);
143 if ((maxnode % BITS_PER_LONG) == 0)
144 endmask = ~0UL;
145 else
146 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
147
148 /* When the user specified more nodes than supported just check
149 if the non supported part is all zero. */
150 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
151 if (nlongs > PAGE_SIZE/sizeof(long))
152 return -EINVAL;
153 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
154 unsigned long t;
155 if (get_user(t, nmask + k))
156 return -EFAULT;
157 if (k == nlongs - 1) {
158 if (t & endmask)
159 return -EINVAL;
160 } else if (t)
161 return -EINVAL;
162 }
163 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
164 endmask = ~0UL;
165 }
166
167 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
168 return -EFAULT;
169 nodes[nlongs-1] &= endmask;
170 return mpol_check_policy(mode, nodes);
171 }
172
173 /* Generate a custom zonelist for the BIND policy. */
174 static struct zonelist *bind_zonelist(unsigned long *nodes)
175 {
176 struct zonelist *zl;
177 int num, max, nd;
178
179 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
180 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
181 if (!zl)
182 return NULL;
183 num = 0;
184 for (nd = find_first_bit(nodes, MAX_NUMNODES);
185 nd < MAX_NUMNODES;
186 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
187 int k;
188 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
189 struct zone *z = &NODE_DATA(nd)->node_zones[k];
190 if (!z->present_pages)
191 continue;
192 zl->zones[num++] = z;
193 if (k > policy_zone)
194 policy_zone = k;
195 }
196 }
197 BUG_ON(num >= max);
198 zl->zones[num] = NULL;
199 return zl;
200 }
201
202 /* Create a new policy */
203 static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
204 {
205 struct mempolicy *policy;
206
207 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
208 if (mode == MPOL_DEFAULT)
209 return NULL;
210 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
211 if (!policy)
212 return ERR_PTR(-ENOMEM);
213 atomic_set(&policy->refcnt, 1);
214 switch (mode) {
215 case MPOL_INTERLEAVE:
216 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
217 break;
218 case MPOL_PREFERRED:
219 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
220 if (policy->v.preferred_node >= MAX_NUMNODES)
221 policy->v.preferred_node = -1;
222 break;
223 case MPOL_BIND:
224 policy->v.zonelist = bind_zonelist(nodes);
225 if (policy->v.zonelist == NULL) {
226 kmem_cache_free(policy_cache, policy);
227 return ERR_PTR(-ENOMEM);
228 }
229 break;
230 }
231 policy->policy = mode;
232 return policy;
233 }
234
235 /* Ensure all existing pages follow the policy. */
236 static int
237 verify_pages(struct mm_struct *mm,
238 unsigned long addr, unsigned long end, unsigned long *nodes)
239 {
240 while (addr < end) {
241 struct page *p;
242 pte_t *pte;
243 pmd_t *pmd;
244 pud_t *pud;
245 pgd_t *pgd;
246 pgd = pgd_offset(mm, addr);
247 if (pgd_none(*pgd)) {
248 unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
249 if (next > addr)
250 break;
251 addr = next;
252 continue;
253 }
254 pud = pud_offset(pgd, addr);
255 if (pud_none(*pud)) {
256 addr = (addr + PUD_SIZE) & PUD_MASK;
257 continue;
258 }
259 pmd = pmd_offset(pud, addr);
260 if (pmd_none(*pmd)) {
261 addr = (addr + PMD_SIZE) & PMD_MASK;
262 continue;
263 }
264 p = NULL;
265 pte = pte_offset_map(pmd, addr);
266 if (pte_present(*pte))
267 p = pte_page(*pte);
268 pte_unmap(pte);
269 if (p) {
270 unsigned nid = page_to_nid(p);
271 if (!test_bit(nid, nodes))
272 return -EIO;
273 }
274 addr += PAGE_SIZE;
275 }
276 return 0;
277 }
278
279 /* Step 1: check the range */
280 static struct vm_area_struct *
281 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
282 unsigned long *nodes, unsigned long flags)
283 {
284 int err;
285 struct vm_area_struct *first, *vma, *prev;
286
287 first = find_vma(mm, start);
288 if (!first)
289 return ERR_PTR(-EFAULT);
290 prev = NULL;
291 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
292 if (!vma->vm_next && vma->vm_end < end)
293 return ERR_PTR(-EFAULT);
294 if (prev && prev->vm_end < vma->vm_start)
295 return ERR_PTR(-EFAULT);
296 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
297 err = verify_pages(vma->vm_mm,
298 vma->vm_start, vma->vm_end, nodes);
299 if (err) {
300 first = ERR_PTR(err);
301 break;
302 }
303 }
304 prev = vma;
305 }
306 return first;
307 }
308
309 /* Apply policy to a single VMA */
310 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
311 {
312 int err = 0;
313 struct mempolicy *old = vma->vm_policy;
314
315 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
316 vma->vm_start, vma->vm_end, vma->vm_pgoff,
317 vma->vm_ops, vma->vm_file,
318 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
319
320 if (vma->vm_ops && vma->vm_ops->set_policy)
321 err = vma->vm_ops->set_policy(vma, new);
322 if (!err) {
323 mpol_get(new);
324 vma->vm_policy = new;
325 mpol_free(old);
326 }
327 return err;
328 }
329
330 /* Step 2: apply policy to a range and do splits. */
331 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
332 unsigned long end, struct mempolicy *new)
333 {
334 struct vm_area_struct *next;
335 int err;
336
337 err = 0;
338 for (; vma && vma->vm_start < end; vma = next) {
339 next = vma->vm_next;
340 if (vma->vm_start < start)
341 err = split_vma(vma->vm_mm, vma, start, 1);
342 if (!err && vma->vm_end > end)
343 err = split_vma(vma->vm_mm, vma, end, 0);
344 if (!err)
345 err = policy_vma(vma, new);
346 if (err)
347 break;
348 }
349 return err;
350 }
351
352 /* Change policy for a memory range */
353 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
354 unsigned long mode,
355 unsigned long __user *nmask, unsigned long maxnode,
356 unsigned flags)
357 {
358 struct vm_area_struct *vma;
359 struct mm_struct *mm = current->mm;
360 struct mempolicy *new;
361 unsigned long end;
362 DECLARE_BITMAP(nodes, MAX_NUMNODES);
363 int err;
364
365 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
366 return -EINVAL;
367 if (start & ~PAGE_MASK)
368 return -EINVAL;
369 if (mode == MPOL_DEFAULT)
370 flags &= ~MPOL_MF_STRICT;
371 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
372 end = start + len;
373 if (end < start)
374 return -EINVAL;
375 if (end == start)
376 return 0;
377
378 err = get_nodes(nodes, nmask, maxnode, mode);
379 if (err)
380 return err;
381
382 new = mpol_new(mode, nodes);
383 if (IS_ERR(new))
384 return PTR_ERR(new);
385
386 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
387 mode,nodes[0]);
388
389 down_write(&mm->mmap_sem);
390 vma = check_range(mm, start, end, nodes, flags);
391 err = PTR_ERR(vma);
392 if (!IS_ERR(vma))
393 err = mbind_range(vma, start, end, new);
394 up_write(&mm->mmap_sem);
395 mpol_free(new);
396 return err;
397 }
398
399 /* Set the process memory policy */
400 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
401 unsigned long maxnode)
402 {
403 int err;
404 struct mempolicy *new;
405 DECLARE_BITMAP(nodes, MAX_NUMNODES);
406
407 if (mode > MPOL_MAX)
408 return -EINVAL;
409 err = get_nodes(nodes, nmask, maxnode, mode);
410 if (err)
411 return err;
412 new = mpol_new(mode, nodes);
413 if (IS_ERR(new))
414 return PTR_ERR(new);
415 mpol_free(current->mempolicy);
416 current->mempolicy = new;
417 if (new && new->policy == MPOL_INTERLEAVE)
418 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
419 return 0;
420 }
421
422 /* Fill a zone bitmap for a policy */
423 static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
424 {
425 int i;
426
427 bitmap_zero(nodes, MAX_NUMNODES);
428 switch (p->policy) {
429 case MPOL_BIND:
430 for (i = 0; p->v.zonelist->zones[i]; i++)
431 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
432 break;
433 case MPOL_DEFAULT:
434 break;
435 case MPOL_INTERLEAVE:
436 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
437 break;
438 case MPOL_PREFERRED:
439 /* or use current node instead of online map? */
440 if (p->v.preferred_node < 0)
441 bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
442 else
443 __set_bit(p->v.preferred_node, nodes);
444 break;
445 default:
446 BUG();
447 }
448 }
449
450 static int lookup_node(struct mm_struct *mm, unsigned long addr)
451 {
452 struct page *p;
453 int err;
454
455 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
456 if (err >= 0) {
457 err = page_to_nid(p);
458 put_page(p);
459 }
460 return err;
461 }
462
463 /* Copy a kernel node mask to user space */
464 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
465 void *nodes, unsigned nbytes)
466 {
467 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
468
469 if (copy > nbytes) {
470 if (copy > PAGE_SIZE)
471 return -EINVAL;
472 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
473 return -EFAULT;
474 copy = nbytes;
475 }
476 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
477 }
478
479 /* Retrieve NUMA policy */
480 asmlinkage long sys_get_mempolicy(int __user *policy,
481 unsigned long __user *nmask,
482 unsigned long maxnode,
483 unsigned long addr, unsigned long flags)
484 {
485 int err, pval;
486 struct mm_struct *mm = current->mm;
487 struct vm_area_struct *vma = NULL;
488 struct mempolicy *pol = current->mempolicy;
489
490 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
491 return -EINVAL;
492 if (nmask != NULL && maxnode < MAX_NUMNODES)
493 return -EINVAL;
494 if (flags & MPOL_F_ADDR) {
495 down_read(&mm->mmap_sem);
496 vma = find_vma_intersection(mm, addr, addr+1);
497 if (!vma) {
498 up_read(&mm->mmap_sem);
499 return -EFAULT;
500 }
501 if (vma->vm_ops && vma->vm_ops->get_policy)
502 pol = vma->vm_ops->get_policy(vma, addr);
503 else
504 pol = vma->vm_policy;
505 } else if (addr)
506 return -EINVAL;
507
508 if (!pol)
509 pol = &default_policy;
510
511 if (flags & MPOL_F_NODE) {
512 if (flags & MPOL_F_ADDR) {
513 err = lookup_node(mm, addr);
514 if (err < 0)
515 goto out;
516 pval = err;
517 } else if (pol == current->mempolicy &&
518 pol->policy == MPOL_INTERLEAVE) {
519 pval = current->il_next;
520 } else {
521 err = -EINVAL;
522 goto out;
523 }
524 } else
525 pval = pol->policy;
526
527 if (vma) {
528 up_read(¤t->mm->mmap_sem);
529 vma = NULL;
530 }
531
532 if (policy && put_user(pval, policy))
533 return -EFAULT;
534
535 err = 0;
536 if (nmask) {
537 DECLARE_BITMAP(nodes, MAX_NUMNODES);
538 get_zonemask(pol, nodes);
539 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
540 }
541
542 out:
543 if (vma)
544 up_read(¤t->mm->mmap_sem);
545 return err;
546 }
547
548 #ifdef CONFIG_COMPAT
549
550 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
551 compat_ulong_t __user *nmask,
552 compat_ulong_t maxnode,
553 compat_ulong_t addr, compat_ulong_t flags)
554 {
555 long err;
556 unsigned long __user *nm = NULL;
557 unsigned long nr_bits, alloc_size;
558 DECLARE_BITMAP(bm, MAX_NUMNODES);
559
560 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
561 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
562
563 if (nmask)
564 nm = compat_alloc_user_space(alloc_size);
565
566 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
567
568 if (!err && nmask) {
569 err = copy_from_user(bm, nm, alloc_size);
570 /* ensure entire bitmap is zeroed */
571 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
572 err |= compat_put_bitmap(nmask, bm, nr_bits);
573 }
574
575 return err;
576 }
577
578 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
579 compat_ulong_t maxnode)
580 {
581 long err = 0;
582 unsigned long __user *nm = NULL;
583 unsigned long nr_bits, alloc_size;
584 DECLARE_BITMAP(bm, MAX_NUMNODES);
585
586 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
587 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
588
589 if (nmask) {
590 err = compat_get_bitmap(bm, nmask, nr_bits);
591 nm = compat_alloc_user_space(alloc_size);
592 err |= copy_to_user(nm, bm, alloc_size);
593 }
594
595 if (err)
596 return -EFAULT;
597
598 return sys_set_mempolicy(mode, nm, nr_bits+1);
599 }
600
601 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
602 compat_ulong_t mode, compat_ulong_t __user *nmask,
603 compat_ulong_t maxnode, compat_ulong_t flags)
604 {
605 long err = 0;
606 unsigned long __user *nm = NULL;
607 unsigned long nr_bits, alloc_size;
608 DECLARE_BITMAP(bm, MAX_NUMNODES);
609
610 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
611 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
612
613 if (nmask) {
614 err = compat_get_bitmap(bm, nmask, nr_bits);
615 nm = compat_alloc_user_space(alloc_size);
616 err |= copy_to_user(nm, bm, alloc_size);
617 }
618
619 if (err)
620 return -EFAULT;
621
622 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
623 }
624
625 #endif
626
627 /* Return effective policy for a VMA */
628 static struct mempolicy *
629 get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
630 {
631 struct mempolicy *pol = current->mempolicy;
632
633 if (vma) {
634 if (vma->vm_ops && vma->vm_ops->get_policy)
635 pol = vma->vm_ops->get_policy(vma, addr);
636 else if (vma->vm_policy &&
637 vma->vm_policy->policy != MPOL_DEFAULT)
638 pol = vma->vm_policy;
639 }
640 if (!pol)
641 pol = &default_policy;
642 return pol;
643 }
644
645 /* Return a zonelist representing a mempolicy */
646 static struct zonelist *zonelist_policy(unsigned gfp, struct mempolicy *policy)
647 {
648 int nd;
649
650 switch (policy->policy) {
651 case MPOL_PREFERRED:
652 nd = policy->v.preferred_node;
653 if (nd < 0)
654 nd = numa_node_id();
655 break;
656 case MPOL_BIND:
657 /* Lower zones don't get a policy applied */
658 if (gfp >= policy_zone)
659 return policy->v.zonelist;
660 /*FALL THROUGH*/
661 case MPOL_INTERLEAVE: /* should not happen */
662 case MPOL_DEFAULT:
663 nd = numa_node_id();
664 break;
665 default:
666 nd = 0;
667 BUG();
668 }
669 return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
670 }
671
672 /* Do dynamic interleaving for a process */
673 static unsigned interleave_nodes(struct mempolicy *policy)
674 {
675 unsigned nid, next;
676 struct task_struct *me = current;
677
678 nid = me->il_next;
679 BUG_ON(nid >= MAX_NUMNODES);
680 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
681 if (next >= MAX_NUMNODES)
682 next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
683 me->il_next = next;
684 return nid;
685 }
686
687 /* Do static interleaving for a VMA with known offset. */
688 static unsigned offset_il_node(struct mempolicy *pol,
689 struct vm_area_struct *vma, unsigned long off)
690 {
691 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
692 unsigned target = (unsigned)off % nnodes;
693 int c;
694 int nid = -1;
695
696 c = 0;
697 do {
698 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
699 c++;
700 } while (c <= target);
701 BUG_ON(nid >= MAX_NUMNODES);
702 BUG_ON(!test_bit(nid, pol->v.nodes));
703 return nid;
704 }
705
706 /* Allocate a page in interleaved policy.
707 Own path because it needs to do special accounting. */
708 static struct page *alloc_page_interleave(unsigned gfp, unsigned order, unsigned nid)
709 {
710 struct zonelist *zl;
711 struct page *page;
712
713 BUG_ON(!node_online(nid));
714 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
715 page = __alloc_pages(gfp, order, zl);
716 if (page && page_zone(page) == zl->zones[0]) {
717 zl->zones[0]->pageset[get_cpu()].interleave_hit++;
718 put_cpu();
719 }
720 return page;
721 }
722
723 /**
724 * alloc_page_vma - Allocate a page for a VMA.
725 *
726 * @gfp:
727 * %GFP_USER user allocation.
728 * %GFP_KERNEL kernel allocations,
729 * %GFP_HIGHMEM highmem/user allocations,
730 * %GFP_FS allocation should not call back into a file system.
731 * %GFP_ATOMIC don't sleep.
732 *
733 * @vma: Pointer to VMA or NULL if not available.
734 * @addr: Virtual Address of the allocation. Must be inside the VMA.
735 *
736 * This function allocates a page from the kernel page pool and applies
737 * a NUMA policy associated with the VMA or the current process.
738 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
739 * mm_struct of the VMA to prevent it from going away. Should be used for
740 * all allocations for pages that will be mapped into
741 * user space. Returns NULL when no page can be allocated.
742 *
743 * Should be called with the mm_sem of the vma hold.
744 */
745 struct page *
746 alloc_page_vma(unsigned gfp, struct vm_area_struct *vma, unsigned long addr)
747 {
748 struct mempolicy *pol = get_vma_policy(vma, addr);
749
750 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
751 unsigned nid;
752 if (vma) {
753 unsigned long off;
754 BUG_ON(addr >= vma->vm_end);
755 BUG_ON(addr < vma->vm_start);
756 off = vma->vm_pgoff;
757 off += (addr - vma->vm_start) >> PAGE_SHIFT;
758 nid = offset_il_node(pol, vma, off);
759 } else {
760 /* fall back to process interleaving */
761 nid = interleave_nodes(pol);
762 }
763 return alloc_page_interleave(gfp, 0, nid);
764 }
765 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
766 }
767
768 /**
769 * alloc_pages_current - Allocate pages.
770 *
771 * @gfp:
772 * %GFP_USER user allocation,
773 * %GFP_KERNEL kernel allocation,
774 * %GFP_HIGHMEM highmem allocation,
775 * %GFP_FS don't call back into a file system.
776 * %GFP_ATOMIC don't sleep.
777 * @order: Power of two of allocation size in pages. 0 is a single page.
778 *
779 * Allocate a page from the kernel page pool. When not in
780 * interrupt context and apply the current process NUMA policy.
781 * Returns NULL when no page can be allocated.
782 */
783 struct page *alloc_pages_current(unsigned gfp, unsigned order)
784 {
785 struct mempolicy *pol = current->mempolicy;
786
787 if (!pol || in_interrupt())
788 pol = &default_policy;
789 if (pol->policy == MPOL_INTERLEAVE)
790 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
791 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
792 }
793 EXPORT_SYMBOL(alloc_pages_current);
794
795 /* Slow path of a mempolicy copy */
796 struct mempolicy *__mpol_copy(struct mempolicy *old)
797 {
798 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
799
800 if (!new)
801 return ERR_PTR(-ENOMEM);
802 *new = *old;
803 atomic_set(&new->refcnt, 1);
804 if (new->policy == MPOL_BIND) {
805 int sz = ksize(old->v.zonelist);
806 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
807 if (!new->v.zonelist) {
808 kmem_cache_free(policy_cache, new);
809 return ERR_PTR(-ENOMEM);
810 }
811 memcpy(new->v.zonelist, old->v.zonelist, sz);
812 }
813 return new;
814 }
815
816 /* Slow path of a mempolicy comparison */
817 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
818 {
819 if (!a || !b)
820 return 0;
821 if (a->policy != b->policy)
822 return 0;
823 switch (a->policy) {
824 case MPOL_DEFAULT:
825 return 1;
826 case MPOL_INTERLEAVE:
827 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
828 case MPOL_PREFERRED:
829 return a->v.preferred_node == b->v.preferred_node;
830 case MPOL_BIND: {
831 int i;
832 for (i = 0; a->v.zonelist->zones[i]; i++)
833 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
834 return 0;
835 return b->v.zonelist->zones[i] == NULL;
836 }
837 default:
838 BUG();
839 return 0;
840 }
841 }
842
843 /* Slow path of a mpol destructor. */
844 void __mpol_free(struct mempolicy *p)
845 {
846 if (!atomic_dec_and_test(&p->refcnt))
847 return;
848 if (p->policy == MPOL_BIND)
849 kfree(p->v.zonelist);
850 p->policy = MPOL_DEFAULT;
851 kmem_cache_free(policy_cache, p);
852 }
853
854 /*
855 * Hugetlb policy. Same as above, just works with node numbers instead of
856 * zonelists.
857 */
858
859 /* Find first node suitable for an allocation */
860 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
861 {
862 struct mempolicy *pol = get_vma_policy(vma, addr);
863
864 switch (pol->policy) {
865 case MPOL_DEFAULT:
866 return numa_node_id();
867 case MPOL_BIND:
868 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
869 case MPOL_INTERLEAVE:
870 return interleave_nodes(pol);
871 case MPOL_PREFERRED:
872 return pol->v.preferred_node >= 0 ?
873 pol->v.preferred_node : numa_node_id();
874 }
875 BUG();
876 return 0;
877 }
878
879 /* Find secondary valid nodes for an allocation */
880 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
881 {
882 struct mempolicy *pol = get_vma_policy(vma, addr);
883
884 switch (pol->policy) {
885 case MPOL_PREFERRED:
886 case MPOL_DEFAULT:
887 case MPOL_INTERLEAVE:
888 return 1;
889 case MPOL_BIND: {
890 struct zone **z;
891 for (z = pol->v.zonelist->zones; *z; z++)
892 if ((*z)->zone_pgdat->node_id == nid)
893 return 1;
894 return 0;
895 }
896 default:
897 BUG();
898 return 0;
899 }
900 }
901
902 /*
903 * Shared memory backing store policy support.
904 *
905 * Remember policies even when nobody has shared memory mapped.
906 * The policies are kept in Red-Black tree linked from the inode.
907 * They are protected by the sp->lock spinlock, which should be held
908 * for any accesses to the tree.
909 */
910
911 /* lookup first element intersecting start-end */
912 /* Caller holds sp->lock */
913 static struct sp_node *
914 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
915 {
916 struct rb_node *n = sp->root.rb_node;
917
918 while (n) {
919 struct sp_node *p = rb_entry(n, struct sp_node, nd);
920
921 if (start >= p->end)
922 n = n->rb_right;
923 else if (end <= p->start)
924 n = n->rb_left;
925 else
926 break;
927 }
928 if (!n)
929 return NULL;
930 for (;;) {
931 struct sp_node *w = NULL;
932 struct rb_node *prev = rb_prev(n);
933 if (!prev)
934 break;
935 w = rb_entry(prev, struct sp_node, nd);
936 if (w->end <= start)
937 break;
938 n = prev;
939 }
940 return rb_entry(n, struct sp_node, nd);
941 }
942
943 /* Insert a new shared policy into the list. */
944 /* Caller holds sp->lock */
945 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
946 {
947 struct rb_node **p = &sp->root.rb_node;
948 struct rb_node *parent = NULL;
949 struct sp_node *nd;
950
951 while (*p) {
952 parent = *p;
953 nd = rb_entry(parent, struct sp_node, nd);
954 if (new->start < nd->start)
955 p = &(*p)->rb_left;
956 else if (new->end > nd->end)
957 p = &(*p)->rb_right;
958 else
959 BUG();
960 }
961 rb_link_node(&new->nd, parent, p);
962 rb_insert_color(&new->nd, &sp->root);
963 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
964 new->policy ? new->policy->policy : 0);
965 }
966
967 /* Find shared policy intersecting idx */
968 struct mempolicy *
969 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
970 {
971 struct mempolicy *pol = NULL;
972 struct sp_node *sn;
973
974 if (!sp->root.rb_node)
975 return NULL;
976 spin_lock(&sp->lock);
977 sn = sp_lookup(sp, idx, idx+1);
978 if (sn) {
979 mpol_get(sn->policy);
980 pol = sn->policy;
981 }
982 spin_unlock(&sp->lock);
983 return pol;
984 }
985
986 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
987 {
988 PDprintk("deleting %lx-l%x\n", n->start, n->end);
989 rb_erase(&n->nd, &sp->root);
990 mpol_free(n->policy);
991 kmem_cache_free(sn_cache, n);
992 }
993
994 struct sp_node *
995 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
996 {
997 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
998
999 if (!n)
1000 return NULL;
1001 n->start = start;
1002 n->end = end;
1003 mpol_get(pol);
1004 n->policy = pol;
1005 return n;
1006 }
1007
1008 /* Replace a policy range. */
1009 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1010 unsigned long end, struct sp_node *new)
1011 {
1012 struct sp_node *n, *new2 = NULL;
1013
1014 restart:
1015 spin_lock(&sp->lock);
1016 n = sp_lookup(sp, start, end);
1017 /* Take care of old policies in the same range. */
1018 while (n && n->start < end) {
1019 struct rb_node *next = rb_next(&n->nd);
1020 if (n->start >= start) {
1021 if (n->end <= end)
1022 sp_delete(sp, n);
1023 else
1024 n->start = end;
1025 } else {
1026 /* Old policy spanning whole new range. */
1027 if (n->end > end) {
1028 if (!new2) {
1029 spin_unlock(&sp->lock);
1030 new2 = sp_alloc(end, n->end, n->policy);
1031 if (!new2)
1032 return -ENOMEM;
1033 goto restart;
1034 }
1035 n->end = start;
1036 sp_insert(sp, new2);
1037 new2 = NULL;
1038 break;
1039 } else
1040 n->end = start;
1041 }
1042 if (!next)
1043 break;
1044 n = rb_entry(next, struct sp_node, nd);
1045 }
1046 if (new)
1047 sp_insert(sp, new);
1048 spin_unlock(&sp->lock);
1049 if (new2) {
1050 mpol_free(new2->policy);
1051 kmem_cache_free(sn_cache, new2);
1052 }
1053 return 0;
1054 }
1055
1056 int mpol_set_shared_policy(struct shared_policy *info,
1057 struct vm_area_struct *vma, struct mempolicy *npol)
1058 {
1059 int err;
1060 struct sp_node *new = NULL;
1061 unsigned long sz = vma_pages(vma);
1062
1063 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1064 vma->vm_pgoff,
1065 sz, npol? npol->policy : -1,
1066 npol ? npol->v.nodes[0] : -1);
1067
1068 if (npol) {
1069 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1070 if (!new)
1071 return -ENOMEM;
1072 }
1073 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1074 if (err && new)
1075 kmem_cache_free(sn_cache, new);
1076 return err;
1077 }
1078
1079 /* Free a backing policy store on inode delete. */
1080 void mpol_free_shared_policy(struct shared_policy *p)
1081 {
1082 struct sp_node *n;
1083 struct rb_node *next;
1084
1085 if (!p->root.rb_node)
1086 return;
1087 spin_lock(&p->lock);
1088 next = rb_first(&p->root);
1089 while (next) {
1090 n = rb_entry(next, struct sp_node, nd);
1091 next = rb_next(&n->nd);
1092 mpol_free(n->policy);
1093 kmem_cache_free(sn_cache, n);
1094 }
1095 spin_unlock(&p->lock);
1096 p->root = RB_ROOT;
1097 }
1098
1099 /* assumes fs == KERNEL_DS */
1100 void __init numa_policy_init(void)
1101 {
1102 policy_cache = kmem_cache_create("numa_policy",
1103 sizeof(struct mempolicy),
1104 0, SLAB_PANIC, NULL, NULL);
1105
1106 sn_cache = kmem_cache_create("shared_policy_node",
1107 sizeof(struct sp_node),
1108 0, SLAB_PANIC, NULL, NULL);
1109
1110 /* Set interleaving policy for system init. This way not all
1111 the data structures allocated at system boot end up in node zero. */
1112
1113 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1114 MAX_NUMNODES) < 0)
1115 printk("numa_policy_init: interleaving failed\n");
1116 }
1117
1118 /* Reset policy of current process to default.
1119 * Assumes fs == KERNEL_DS */
1120 void numa_default_policy(void)
1121 {
1122 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
1123 }
1124
|
This page was automatically generated by the
LXR engine.
|