Diff markup
1 /* 1 /*
2 * linux/mm/swap_state.c 2 * linux/mm/swap_state.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linu 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 * Swap reorganised 29.12.95, Stephen Tweedie 5 * Swap reorganised 29.12.95, Stephen Tweedie
6 * 6 *
7 * Rewritten to use page cache, (C) 1998 Step 7 * Rewritten to use page cache, (C) 1998 Stephen Tweedie
8 */ 8 */
9 #include <linux/module.h> 9 #include <linux/module.h>
10 #include <linux/mm.h> 10 #include <linux/mm.h>
11 #include <linux/kernel_stat.h> 11 #include <linux/kernel_stat.h>
12 #include <linux/swap.h> 12 #include <linux/swap.h>
13 #include <linux/swapops.h> 13 #include <linux/swapops.h>
14 #include <linux/init.h> 14 #include <linux/init.h>
15 #include <linux/pagemap.h> 15 #include <linux/pagemap.h>
16 #include <linux/buffer_head.h> 16 #include <linux/buffer_head.h>
17 #include <linux/backing-dev.h> 17 #include <linux/backing-dev.h>
18 #include <linux/pagevec.h> 18 #include <linux/pagevec.h>
19 #include <linux/migrate.h> 19 #include <linux/migrate.h>
20 20
21 #include <asm/pgtable.h> 21 #include <asm/pgtable.h>
22 22
23 /* 23 /*
24 * swapper_space is a fiction, retained to sim 24 * swapper_space is a fiction, retained to simplify the path through
25 * vmscan's shrink_page_list, to make sync_pag 25 * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
26 * future use of radix_tree tags in the swap c 26 * future use of radix_tree tags in the swap cache.
27 */ 27 */
28 static const struct address_space_operations s 28 static const struct address_space_operations swap_aops = {
29 .writepage = swap_writepage, 29 .writepage = swap_writepage,
30 .sync_page = block_sync_page, 30 .sync_page = block_sync_page,
31 .set_page_dirty = __set_page_dirty_nob 31 .set_page_dirty = __set_page_dirty_nobuffers,
32 .migratepage = migrate_page, 32 .migratepage = migrate_page,
33 }; 33 };
34 34
35 static struct backing_dev_info swap_backing_de 35 static struct backing_dev_info swap_backing_dev_info = {
36 .capabilities = BDI_CAP_NO_ACCT_DIRT 36 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
37 .unplug_io_fn = swap_unplug_io_fn, 37 .unplug_io_fn = swap_unplug_io_fn,
38 }; 38 };
39 39
40 struct address_space swapper_space = { 40 struct address_space swapper_space = {
41 .page_tree = RADIX_TREE_INIT(GFP_ 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .tree_lock = __RW_LOCK_UNLOCKED(s <<
43 .a_ops = &swap_aops, 42 .a_ops = &swap_aops,
44 .i_mmap_nonlinear = LIST_HEAD_INIT(swa 43 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
45 .backing_dev_info = &swap_backing_dev_ 44 .backing_dev_info = &swap_backing_dev_info,
46 }; 45 };
47 46
48 #define INC_CACHE_INFO(x) do { swap_cach 47 #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
49 48
50 static struct { 49 static struct {
51 unsigned long add_total; 50 unsigned long add_total;
52 unsigned long del_total; 51 unsigned long del_total;
53 unsigned long find_success; 52 unsigned long find_success;
54 unsigned long find_total; 53 unsigned long find_total;
55 } swap_cache_info; 54 } swap_cache_info;
56 55
57 void show_swap_cache_info(void) 56 void show_swap_cache_info(void)
58 { 57 {
59 printk("Swap cache: add %lu, delete %l 58 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swa 59 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, 60 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %lukB\n", nr_swap 61 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
63 printk("Total swap = %lukB\n", total_s 62 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
64 } 63 }
65 64
66 /* 65 /*
67 * add_to_swap_cache resembles add_to_page_cac 66 * add_to_swap_cache resembles add_to_page_cache on swapper_space,
68 * but sets SwapCache flag and private instead 67 * but sets SwapCache flag and private instead of mapping and index.
69 */ 68 */
70 int add_to_swap_cache(struct page *page, swp_e 69 int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
71 { 70 {
72 int error; 71 int error;
73 72
74 BUG_ON(!PageLocked(page)); 73 BUG_ON(!PageLocked(page));
75 BUG_ON(PageSwapCache(page)); 74 BUG_ON(PageSwapCache(page));
76 BUG_ON(PagePrivate(page)); 75 BUG_ON(PagePrivate(page));
77 error = radix_tree_preload(gfp_mask); 76 error = radix_tree_preload(gfp_mask);
78 if (!error) { 77 if (!error) {
79 write_lock_irq(&swapper_space. !! 78 DEFINE_RADIX_TREE_CONTEXT(ctx, &swapper_space.page_tree);
80 error = radix_tree_insert(&swa !! 79
81 !! 80 lock_page_ref_irq(page);
>> 81 radix_tree_lock(&ctx);
>> 82 error = radix_tree_insert(ctx.tree, entry.val, page);
>> 83 radix_tree_unlock(&ctx);
82 if (!error) { 84 if (!error) {
83 page_cache_get(page); 85 page_cache_get(page);
84 SetPageSwapCache(page) 86 SetPageSwapCache(page);
85 set_page_private(page, 87 set_page_private(page, entry.val);
86 total_swapcache_pages+ !! 88 mapping_nrpages_inc(&swapper_space);
87 __inc_zone_page_state( 89 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_tot 90 INC_CACHE_INFO(add_total);
89 } 91 }
90 write_unlock_irq(&swapper_spac !! 92 unlock_page_ref_irq(page);
91 radix_tree_preload_end(); 93 radix_tree_preload_end();
92 } 94 }
93 return error; 95 return error;
94 } 96 }
95 97
96 /* 98 /*
97 * This must be called only on pages that have 99 * This must be called only on pages that have
98 * been verified to be in the swap cache. 100 * been verified to be in the swap cache.
99 */ 101 */
100 void __delete_from_swap_cache(struct page *pag 102 void __delete_from_swap_cache(struct page *page)
101 { 103 {
>> 104 DEFINE_RADIX_TREE_CONTEXT(ctx, &swapper_space.page_tree);
>> 105
102 BUG_ON(!PageLocked(page)); 106 BUG_ON(!PageLocked(page));
103 BUG_ON(!PageSwapCache(page)); 107 BUG_ON(!PageSwapCache(page));
104 BUG_ON(PageWriteback(page)); 108 BUG_ON(PageWriteback(page));
105 BUG_ON(PagePrivate(page)); 109 BUG_ON(PagePrivate(page));
106 110
107 radix_tree_delete(&swapper_space.page_ !! 111 radix_tree_lock(&ctx);
>> 112 radix_tree_delete(ctx.tree, page_private(page));
>> 113 radix_tree_unlock(&ctx);
108 set_page_private(page, 0); 114 set_page_private(page, 0);
109 ClearPageSwapCache(page); 115 ClearPageSwapCache(page);
110 total_swapcache_pages--; !! 116 mapping_nrpages_dec(&swapper_space);
111 __dec_zone_page_state(page, NR_FILE_PA 117 __dec_zone_page_state(page, NR_FILE_PAGES);
112 INC_CACHE_INFO(del_total); 118 INC_CACHE_INFO(del_total);
113 } 119 }
114 120
115 /** 121 /**
116 * add_to_swap - allocate swap space for a pag 122 * add_to_swap - allocate swap space for a page
117 * @page: page we want to move to swap 123 * @page: page we want to move to swap
118 * @gfp_mask: memory allocation flags 124 * @gfp_mask: memory allocation flags
119 * 125 *
120 * Allocate swap space for the page and add th 126 * Allocate swap space for the page and add the page to the
121 * swap cache. Caller needs to hold the page 127 * swap cache. Caller needs to hold the page lock.
122 */ 128 */
123 int add_to_swap(struct page * page, gfp_t gfp_ 129 int add_to_swap(struct page * page, gfp_t gfp_mask)
124 { 130 {
125 swp_entry_t entry; 131 swp_entry_t entry;
126 int err; 132 int err;
127 133
128 BUG_ON(!PageLocked(page)); 134 BUG_ON(!PageLocked(page));
129 BUG_ON(!PageUptodate(page)); 135 BUG_ON(!PageUptodate(page));
130 136
131 for (;;) { 137 for (;;) {
132 entry = get_swap_page(); 138 entry = get_swap_page();
133 if (!entry.val) 139 if (!entry.val)
134 return 0; 140 return 0;
135 141
136 /* 142 /*
137 * Radix-tree node allocations 143 * Radix-tree node allocations from PF_MEMALLOC contexts could
138 * completely exhaust the page 144 * completely exhaust the page allocator. __GFP_NOMEMALLOC
139 * stops emergency reserves fr 145 * stops emergency reserves from being allocated.
140 * 146 *
141 * TODO: this could cause a th 147 * TODO: this could cause a theoretical memory reclaim
142 * deadlock in the swap out pa 148 * deadlock in the swap out path.
143 */ 149 */
144 /* 150 /*
145 * Add it to the swap cache an 151 * Add it to the swap cache and mark it dirty
146 */ 152 */
147 err = add_to_swap_cache(page, 153 err = add_to_swap_cache(page, entry,
148 gfp_mask|__GFP 154 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
149 155
150 switch (err) { 156 switch (err) {
151 case 0: 157 case 0: /* Success */
152 SetPageDirty(page); 158 SetPageDirty(page);
153 return 1; 159 return 1;
154 case -EEXIST: 160 case -EEXIST:
155 /* Raced with "specula 161 /* Raced with "speculative" read_swap_cache_async */
156 swap_free(entry); 162 swap_free(entry);
157 continue; 163 continue;
158 default: 164 default:
159 /* -ENOMEM radix-tree 165 /* -ENOMEM radix-tree allocation failure */
160 swap_free(entry); 166 swap_free(entry);
161 return 0; 167 return 0;
162 } 168 }
163 } 169 }
164 } 170 }
165 171
166 /* 172 /*
167 * This must be called only on pages that have 173 * This must be called only on pages that have
168 * been verified to be in the swap cache and l 174 * been verified to be in the swap cache and locked.
169 * It will never put the page into the free li 175 * It will never put the page into the free list,
170 * the caller has a reference on the page. 176 * the caller has a reference on the page.
171 */ 177 */
172 void delete_from_swap_cache(struct page *page) 178 void delete_from_swap_cache(struct page *page)
173 { 179 {
174 swp_entry_t entry; 180 swp_entry_t entry;
175 181
176 entry.val = page_private(page); 182 entry.val = page_private(page);
177 183
178 write_lock_irq(&swapper_space.tree_loc !! 184 lock_page_ref_irq(page);
179 __delete_from_swap_cache(page); 185 __delete_from_swap_cache(page);
180 write_unlock_irq(&swapper_space.tree_l !! 186 unlock_page_ref_irq(page);
181 187
182 swap_free(entry); 188 swap_free(entry);
183 page_cache_release(page); 189 page_cache_release(page);
184 } 190 }
185 191
186 /* 192 /*
187 * If we are the only user, then try to free u 193 * If we are the only user, then try to free up the swap cache.
188 * 194 *
189 * Its ok to check for PageSwapCache without t 195 * Its ok to check for PageSwapCache without the page lock
190 * here because we are going to recheck again 196 * here because we are going to recheck again inside
191 * exclusive_swap_page() _with_ the lock. 197 * exclusive_swap_page() _with_ the lock.
192 * - Marc 198 * - Marcelo
193 */ 199 */
194 static inline void free_swap_cache(struct page 200 static inline void free_swap_cache(struct page *page)
195 { 201 {
196 if (PageSwapCache(page) && !TestSetPag 202 if (PageSwapCache(page) && !TestSetPageLocked(page)) {
197 remove_exclusive_swap_page(pag 203 remove_exclusive_swap_page(page);
198 unlock_page(page); 204 unlock_page(page);
199 } 205 }
200 } 206 }
201 207
202 /* 208 /*
203 * Perform a free_page(), also freeing any swa 209 * Perform a free_page(), also freeing any swap cache associated with
204 * this page if it is the last user of the pag 210 * this page if it is the last user of the page.
205 */ 211 */
206 void free_page_and_swap_cache(struct page *pag 212 void free_page_and_swap_cache(struct page *page)
207 { 213 {
208 free_swap_cache(page); 214 free_swap_cache(page);
209 page_cache_release(page); 215 page_cache_release(page);
210 } 216 }
211 217
212 /* 218 /*
213 * Passed an array of pages, drop them all fro 219 * Passed an array of pages, drop them all from swapcache and then release
214 * them. They are removed from the LRU and fr 220 * them. They are removed from the LRU and freed if this is their last use.
215 */ 221 */
216 void free_pages_and_swap_cache(struct page **p 222 void free_pages_and_swap_cache(struct page **pages, int nr)
217 { 223 {
218 struct page **pagep = pages; 224 struct page **pagep = pages;
219 225
220 lru_add_drain(); 226 lru_add_drain();
221 while (nr) { 227 while (nr) {
222 int todo = min(nr, PAGEVEC_SIZ 228 int todo = min(nr, PAGEVEC_SIZE);
223 int i; 229 int i;
224 230
225 for (i = 0; i < todo; i++) 231 for (i = 0; i < todo; i++)
226 free_swap_cache(pagep[ 232 free_swap_cache(pagep[i]);
227 release_pages(pagep, todo, 0); 233 release_pages(pagep, todo, 0);
228 pagep += todo; 234 pagep += todo;
229 nr -= todo; 235 nr -= todo;
230 } 236 }
231 } 237 }
232 238
233 /* 239 /*
234 * Lookup a swap entry in the swap cache. A fo 240 * Lookup a swap entry in the swap cache. A found page will be returned
235 * unlocked and with its refcount incremented 241 * unlocked and with its refcount incremented - we rely on the kernel
236 * lock getting page table operations atomic e 242 * lock getting page table operations atomic even if we drop the page
237 * lock before returning. 243 * lock before returning.
238 */ 244 */
239 struct page * lookup_swap_cache(swp_entry_t en 245 struct page * lookup_swap_cache(swp_entry_t entry)
240 { 246 {
241 struct page *page; 247 struct page *page;
242 248
243 page = find_get_page(&swapper_space, e 249 page = find_get_page(&swapper_space, entry.val);
244 250
245 if (page) 251 if (page)
246 INC_CACHE_INFO(find_success); 252 INC_CACHE_INFO(find_success);
247 253
248 INC_CACHE_INFO(find_total); 254 INC_CACHE_INFO(find_total);
249 return page; 255 return page;
250 } 256 }
251 257
252 /* 258 /*
253 * Locate a page of swap in physical memory, r 259 * Locate a page of swap in physical memory, reserving swap cache space
254 * and reading the disk if it is not already c 260 * and reading the disk if it is not already cached.
255 * A failure return means that either the page 261 * A failure return means that either the page allocation failed or that
256 * the swap entry is no longer in use. 262 * the swap entry is no longer in use.
257 */ 263 */
258 struct page *read_swap_cache_async(swp_entry_t 264 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
259 struct vm_area_struct 265 struct vm_area_struct *vma, unsigned long addr)
260 { 266 {
261 struct page *found_page, *new_page = N 267 struct page *found_page, *new_page = NULL;
262 int err; 268 int err;
263 269
264 do { 270 do {
265 /* 271 /*
266 * First check the swap cache. 272 * First check the swap cache. Since this is normally
267 * called after lookup_swap_ca 273 * called after lookup_swap_cache() failed, re-calling
268 * that would confuse statisti 274 * that would confuse statistics.
269 */ 275 */
270 found_page = find_get_page(&sw 276 found_page = find_get_page(&swapper_space, entry.val);
271 if (found_page) 277 if (found_page)
272 break; 278 break;
273 279
274 /* 280 /*
275 * Get a new page to read into 281 * Get a new page to read into from swap.
276 */ 282 */
277 if (!new_page) { 283 if (!new_page) {
278 new_page = alloc_page_ 284 new_page = alloc_page_vma(gfp_mask, vma, addr);
279 if (!new_page) 285 if (!new_page)
280 break; 286 break; /* Out of memory */
281 } 287 }
282 288
283 /* 289 /*
284 * Swap entry may have been fr 290 * Swap entry may have been freed since our caller observed it.
285 */ 291 */
286 if (!swap_duplicate(entry)) 292 if (!swap_duplicate(entry))
287 break; 293 break;
288 294
289 /* 295 /*
290 * Associate the page with swa 296 * Associate the page with swap entry in the swap cache.
291 * May fail (-EEXIST) if there 297 * May fail (-EEXIST) if there is already a page associated
292 * with this entry in the swap 298 * with this entry in the swap cache: added by a racing
293 * read_swap_cache_async, or a 299 * read_swap_cache_async, or add_to_swap or shmem_writepage
294 * re-using the just freed swa 300 * re-using the just freed swap entry for an existing page.
295 * May fail (-ENOMEM) if radix 301 * May fail (-ENOMEM) if radix-tree node allocation failed.
296 */ 302 */
297 SetPageLocked(new_page); 303 SetPageLocked(new_page);
298 err = add_to_swap_cache(new_pa 304 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
299 if (!err) { 305 if (!err) {
300 /* 306 /*
301 * Initiate read into 307 * Initiate read into locked page and return.
302 */ 308 */
303 lru_cache_add_active(n 309 lru_cache_add_active(new_page);
304 swap_readpage(NULL, ne 310 swap_readpage(NULL, new_page);
305 return new_page; 311 return new_page;
306 } 312 }
307 ClearPageLocked(new_page); 313 ClearPageLocked(new_page);
308 swap_free(entry); 314 swap_free(entry);
309 } while (err != -ENOMEM); 315 } while (err != -ENOMEM);
310 316
311 if (new_page) 317 if (new_page)
312 page_cache_release(new_page); 318 page_cache_release(new_page);
313 return found_page; 319 return found_page;
314 } 320 }
315 321
316 /** 322 /**
317 * swapin_readahead - swap in pages in hope we 323 * swapin_readahead - swap in pages in hope we need them soon
318 * @entry: swap entry of this memory 324 * @entry: swap entry of this memory
319 * @gfp_mask: memory allocation flags 325 * @gfp_mask: memory allocation flags
320 * @vma: user vma this address belongs to 326 * @vma: user vma this address belongs to
321 * @addr: target address for mempolicy 327 * @addr: target address for mempolicy
322 * 328 *
323 * Returns the struct page for entry and addr, 329 * Returns the struct page for entry and addr, after queueing swapin.
324 * 330 *
325 * Primitive swap readahead code. We simply re 331 * Primitive swap readahead code. We simply read an aligned block of
326 * (1 << page_cluster) entries in the swap are 332 * (1 << page_cluster) entries in the swap area. This method is chosen
327 * because it doesn't cost us any seek time. 333 * because it doesn't cost us any seek time. We also make sure to queue
328 * the 'original' request together with the re 334 * the 'original' request together with the readahead ones...
329 * 335 *
330 * This has been extended to use the NUMA poli 336 * This has been extended to use the NUMA policies from the mm triggering
331 * the readahead. 337 * the readahead.
332 * 338 *
333 * Caller must hold down_read on the vma->vm_m 339 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
334 */ 340 */
335 struct page *swapin_readahead(swp_entry_t entr 341 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
336 struct vm_area_struct 342 struct vm_area_struct *vma, unsigned long addr)
337 { 343 {
338 int nr_pages; 344 int nr_pages;
339 struct page *page; 345 struct page *page;
340 unsigned long offset; 346 unsigned long offset;
341 unsigned long end_offset; 347 unsigned long end_offset;
342 348
343 /* 349 /*
344 * Get starting offset for readaround, 350 * Get starting offset for readaround, and number of pages to read.
345 * Adjust starting address by readbehi 351 * Adjust starting address by readbehind (for NUMA interleave case)?
346 * No, it's very unlikely that swap la 352 * No, it's very unlikely that swap layout would follow vma layout,
347 * more likely that neighbouring swap 353 * more likely that neighbouring swap pages came from the same node:
348 * so use the same "addr" to choose th 354 * so use the same "addr" to choose the same node for each swap read.
349 */ 355 */
350 nr_pages = valid_swaphandles(entry, &o 356 nr_pages = valid_swaphandles(entry, &offset);
351 for (end_offset = offset + nr_pages; o 357 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
352 /* Ok, do the async read-ahead 358 /* Ok, do the async read-ahead now */
353 page = read_swap_cache_async(s 359 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
354 360 gfp_mask, vma, addr);
355 if (!page) 361 if (!page)
356 break; 362 break;
357 page_cache_release(page); 363 page_cache_release(page);
358 } 364 }
359 lru_add_drain(); /* Push any ne 365 lru_add_drain(); /* Push any new pages onto the LRU now */
360 return read_swap_cache_async(entry, gf 366 return read_swap_cache_async(entry, gfp_mask, vma, addr);
361 } 367 }
362 368
|
This page was automatically generated by the
LXR engine.
|