Linux kernel & device driver programming

Cross-Referenced Linux and Device Driver Code

[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]
Version: [ 2.6.11.8 ] [ 2.6.25 ] [ 2.6.25.8 ] [ 2.6.31.13 ] Architecture: [ i386 ]
  1 /*
  2  *      linux/mm/filemap.c
  3  *
  4  * Copyright (C) 1994-1999  Linus Torvalds
  5  */
  6 
  7 /*
  8  * This file handles the generic file mmap semantics used by
  9  * most "normal" filesystems (but you don't /have/ to use this:
 10  * the NFS filesystem used to do this differently, for example)
 11  */
 12 #include <linux/module.h>
 13 #include <linux/slab.h>
 14 #include <linux/compiler.h>
 15 #include <linux/fs.h>
 16 #include <linux/uaccess.h>
 17 #include <linux/aio.h>
 18 #include <linux/capability.h>
 19 #include <linux/kernel_stat.h>
 20 #include <linux/mm.h>
 21 #include <linux/swap.h>
 22 #include <linux/mman.h>
 23 #include <linux/pagemap.h>
 24 #include <linux/file.h>
 25 #include <linux/uio.h>
 26 #include <linux/hash.h>
 27 #include <linux/writeback.h>
 28 #include <linux/backing-dev.h>
 29 #include <linux/pagevec.h>
 30 #include <linux/blkdev.h>
 31 #include <linux/security.h>
 32 #include <linux/syscalls.h>
 33 #include <linux/cpuset.h>
 34 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 35 #include <linux/memcontrol.h>
 36 #include "internal.h"
 37 
 38 /*
 39  * FIXME: remove all knowledge of the buffer layer from the core VM
 40  */
 41 #include <linux/buffer_head.h> /* for generic_osync_inode */
 42 
 43 #include <asm/mman.h>
 44 
 45 static ssize_t
 46 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 47         loff_t offset, unsigned long nr_segs);
 48 
 49 /*
 50  * Shared mappings implemented 30.11.1994. It's not fully working yet,
 51  * though.
 52  *
 53  * Shared mappings now work. 15.8.1995  Bruno.
 54  *
 55  * finished 'unifying' the page and buffer cache and SMP-threaded the
 56  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 57  *
 58  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 59  */
 60 
 61 /*
 62  * Lock ordering:
 63  *
 64  *  ->i_mmap_lock               (vmtruncate)
 65  *    ->private_lock            (__free_pte->__set_page_dirty_buffers)
 66  *      ->swap_lock             (exclusive_swap_page, others)
 67  *        ->mapping->tree_lock
 68  *
 69  *  ->i_mutex
 70  *    ->i_mmap_lock             (truncate->unmap_mapping_range)
 71  *
 72  *  ->mmap_sem
 73  *    ->i_mmap_lock
 74  *      ->page_table_lock or pte_lock   (various, mainly in memory.c)
 75  *        ->mapping->tree_lock  (arch-dependent flush_dcache_mmap_lock)
 76  *
 77  *  ->mmap_sem
 78  *    ->lock_page               (access_process_vm)
 79  *
 80  *  ->i_mutex                   (generic_file_buffered_write)
 81  *    ->mmap_sem                (fault_in_pages_readable->do_page_fault)
 82  *
 83  *  ->i_mutex
 84  *    ->i_alloc_sem             (various)
 85  *
 86  *  ->inode_lock
 87  *    ->sb_lock                 (fs/fs-writeback.c)
 88  *    ->mapping->tree_lock      (__sync_single_inode)
 89  *
 90  *  ->i_mmap_lock
 91  *    ->anon_vma.lock           (vma_adjust)
 92  *
 93  *  ->anon_vma.lock
 94  *    ->page_table_lock or pte_lock     (anon_vma_prepare and various)
 95  *
 96  *  ->page_table_lock or pte_lock
 97  *    ->swap_lock               (try_to_unmap_one)
 98  *    ->private_lock            (try_to_unmap_one)
 99  *    ->tree_lock               (try_to_unmap_one)
100  *    ->zone.lru_lock           (follow_page->mark_page_accessed)
101  *    ->zone.lru_lock           (check_pte_range->isolate_lru_page)
102  *    ->private_lock            (page_remove_rmap->set_page_dirty)
103  *    ->tree_lock               (page_remove_rmap->set_page_dirty)
104  *    ->inode_lock              (page_remove_rmap->set_page_dirty)
105  *    ->inode_lock              (zap_pte_range->set_page_dirty)
106  *    ->private_lock            (zap_pte_range->__set_page_dirty_buffers)
107  *
108  *  ->task->proc_lock
109  *    ->dcache_lock             (proc_pid_lookup)
110  */
111 
/*
 * Remove a page from the page cache.
 *
 * The page's memory-cgroup charge is dropped, its slot at page->index is
 * deleted from the mapping's radix tree, page->mapping is cleared and the
 * per-mapping and per-zone page counters are decremented.  The page must
 * not be mapped into any page table (enforced with BUG_ON).
 *
 * Caller has to make sure the page is locked and that nobody else uses
 * it - or that usage is safe.
 *
 * NOTE(review): the historical comment here said the page is also freed
 * and that the caller must hold the mapping's tree_lock.  Nothing in this
 * function frees the page, and the one caller visible in this file,
 * remove_from_page_cache(), brackets the call with lock_page_ref_irq() /
 * unlock_page_ref_irq() - confirm the exact locking contract for this tree.
 */
void __remove_from_page_cache(struct page *page)
{
        /* Sample the mapping first: page->mapping is cleared further down. */
        struct address_space *mapping = page->mapping;
        DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree);

        mem_cgroup_uncharge_page(page);

        /* Drop the radix-tree entry under the radix-tree context lock. */
        radix_tree_lock(&ctx);
        radix_tree_delete(ctx.tree, page->index);
        radix_tree_unlock(&ctx);

        page->mapping = NULL;
        mapping_nrpages_dec(mapping);
        __dec_zone_page_state(page, NR_FILE_PAGES);
        BUG_ON(page_mapped(page));

        /*
         * Some filesystems seem to re-dirty the page even after
         * the VM has canceled the dirty bit (eg ext3 journaling).
         *
         * Fix it up by doing a final dirty accounting check after
         * having removed the page entirely.
         */
        if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
                dec_zone_page_state(page, NR_FILE_DIRTY);
                dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
        }
}
145 
/*
 * Remove a locked page from the page cache.
 *
 * The page must already have PG_locked set; anything else is a BUG().
 * The actual removal is done by __remove_from_page_cache(), bracketed by
 * lock_page_ref_irq() / unlock_page_ref_irq() on the page.
 */
void remove_from_page_cache(struct page *page)
{
        if (unlikely(!PageLocked(page)))
                BUG();

        lock_page_ref_irq(page);
        __remove_from_page_cache(page);
        unlock_page_ref_irq(page);
}
154 
/*
 * Wait-bit action used while sleeping on a page flag bit (it is the action
 * handed to __wait_on_bit() / __wait_on_bit_lock() by wait_on_page_bit()
 * and __lock_page() in this file).
 *
 * @word: address of the ->flags word of the struct page being waited on;
 *        the page itself is recovered with container_of().
 *
 * Calls the mapping's ->sync_page() operation when the page has a mapping
 * that provides one, then sleeps in io_schedule().  Always returns 0.
 */
static int sync_page(void *word)
{
        struct address_space *mapping;
        struct page *page;

        page = container_of((unsigned long *)word, struct page, flags);

        /*
         * page_mapping() is being called without PG_locked held.
         * Some knowledge of the state and use of the page is used to
         * reduce the requirements down to a memory barrier.
         * The danger here is of a stale page_mapping() return value
         * indicating a struct address_space different from the one it's
         * associated with when it is associated with one.
         * After smp_mb(), it's either the correct page_mapping() for
         * the page, or an old page_mapping() and the page's own
         * page_mapping() has gone NULL.
         * The ->sync_page() address_space operation must tolerate
         * page_mapping() going NULL. By an amazing coincidence,
         * this comes about because none of the users of the page
         * in the ->sync_page() methods make essential use of the
         * page_mapping(), merely passing the page down to the backing
         * device's unplug functions when it's non-NULL, which in turn
         * ignore it for all cases but swap, where only page_private(page) is
         * of interest. When page_mapping() does go NULL, the entire
         * call stack gracefully ignores the page and returns.
         * -- wli
         */
        smp_mb();
        mapping = page_mapping(page);
        /* Every level of the ops chain is optional; skip quietly if absent. */
        if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
                mapping->a_ops->sync_page(page);
        io_schedule();
        return 0;
}
190 
191 static int sync_page_killable(void *word)
192 {
193         sync_page(word);
194         return fatal_signal_pending(current) ? -EINTR : 0;
195 }
196 
197 /**
198  * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
199  * @mapping:    address space structure to write
200  * @start:      offset in bytes where the range starts
201  * @end:        offset in bytes where the range ends (inclusive)
202  * @sync_mode:  enable synchronous operation
203  *
204  * Start writeback against all of a mapping's dirty pages that lie
205  * within the byte offsets <start, end> inclusive.
206  *
207  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
208  * opposed to a regular memory cleansing writeback.  The difference between
209  * these two operations is that if a dirty page/buffer is encountered, it must
210  * be waited upon, and not just skipped over.
211  */
212 int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
213                                 loff_t end, int sync_mode)
214 {
215         int ret;
216         struct writeback_control wbc = {
217                 .sync_mode = sync_mode,
218                 .nr_to_write = mapping_nrpages(mapping) * 2,
219                 .range_start = start,
220                 .range_end = end,
221         };
222 
223         if (!mapping_cap_writeback_dirty(mapping))
224                 return 0;
225 
226         ret = do_writepages(mapping, &wbc);
227         return ret;
228 }
229 
230 static inline int __filemap_fdatawrite(struct address_space *mapping,
231         int sync_mode)
232 {
233         return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
234 }
235 
236 int filemap_fdatawrite(struct address_space *mapping)
237 {
238         return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
239 }
240 EXPORT_SYMBOL(filemap_fdatawrite);
241 
242 static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
243                                 loff_t end)
244 {
245         return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
246 }
247 
248 /**
249  * filemap_flush - mostly a non-blocking flush
250  * @mapping:    target address_space
251  *
252  * This is a mostly non-blocking flush.  Not suitable for data-integrity
253  * purposes - I/O may not be started against all dirty pages.
254  */
255 int filemap_flush(struct address_space *mapping)
256 {
257         return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
258 }
259 EXPORT_SYMBOL(filemap_flush);
260 
261 /**
262  * wait_on_page_writeback_range - wait for writeback to complete
263  * @mapping:    target address_space
264  * @start:      beginning page index
265  * @end:        ending page index
266  *
267  * Wait for writeback to complete against pages indexed by start->end
268  * inclusive
269  */
270 int wait_on_page_writeback_range(struct address_space *mapping,
271                                 pgoff_t start, pgoff_t end)
272 {
273         struct pagevec pvec;
274         int nr_pages;
275         int ret = 0;
276         pgoff_t index;
277 
278         if (end < start)
279                 return 0;
280 
281         pagevec_init(&pvec, 0);
282         index = start;
283         while ((index <= end) &&
284                         (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
285                         PAGECACHE_TAG_WRITEBACK,
286                         min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
287                 unsigned i;
288 
289                 for (i = 0; i < nr_pages; i++) {
290                         struct page *page = pvec.pages[i];
291 
292                         /* until radix tree lookup accepts end_index */
293                         if (page->index > end)
294                                 continue;
295 
296                         wait_on_page_writeback(page);
297                         if (PageError(page))
298                                 ret = -EIO;
299                 }
300                 pagevec_release(&pvec);
301                 cond_resched();
302         }
303 
304         /* Check for outstanding write errors */
305         if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
306                 ret = -ENOSPC;
307         if (test_and_clear_bit(AS_EIO, &mapping->flags))
308                 ret = -EIO;
309 
310         return ret;
311 }
312 
313 /**
314  * sync_page_range - write and wait on all pages in the passed range
315  * @inode:      target inode
316  * @mapping:    target address_space
317  * @pos:        beginning offset in pages to write
318  * @count:      number of bytes to write
319  *
320  * Write and wait upon all the pages in the passed range.  This is a "data
321  * integrity" operation.  It waits upon in-flight writeout before starting and
322  * waiting upon new writeout.  If there was an IO error, return it.
323  *
324  * We need to re-take i_mutex during the generic_osync_inode list walk because
325  * it is otherwise livelockable.
326  */
327 int sync_page_range(struct inode *inode, struct address_space *mapping,
328                         loff_t pos, loff_t count)
329 {
330         pgoff_t start = pos >> PAGE_CACHE_SHIFT;
331         pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
332         int ret;
333 
334         if (!mapping_cap_writeback_dirty(mapping) || !count)
335                 return 0;
336         ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
337         if (ret == 0) {
338                 mutex_lock(&inode->i_mutex);
339                 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
340                 mutex_unlock(&inode->i_mutex);
341         }
342         if (ret == 0)
343                 ret = wait_on_page_writeback_range(mapping, start, end);
344         return ret;
345 }
346 EXPORT_SYMBOL(sync_page_range);
347 
348 /**
349  * sync_page_range_nolock - write & wait on all pages in the passed range without locking
350  * @inode:      target inode
351  * @mapping:    target address_space
352  * @pos:        beginning offset in pages to write
353  * @count:      number of bytes to write
354  *
355  * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea
356  * as it forces O_SYNC writers to different parts of the same file
357  * to be serialised right until io completion.
358  */
359 int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
360                            loff_t pos, loff_t count)
361 {
362         pgoff_t start = pos >> PAGE_CACHE_SHIFT;
363         pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
364         int ret;
365 
366         if (!mapping_cap_writeback_dirty(mapping) || !count)
367                 return 0;
368         ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
369         if (ret == 0)
370                 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
371         if (ret == 0)
372                 ret = wait_on_page_writeback_range(mapping, start, end);
373         return ret;
374 }
375 EXPORT_SYMBOL(sync_page_range_nolock);
376 
377 /**
378  * filemap_fdatawait - wait for all under-writeback pages to complete
379  * @mapping: address space structure to wait for
380  *
381  * Walk the list of under-writeback pages of the given address space
382  * and wait for all of them.
383  */
384 int filemap_fdatawait(struct address_space *mapping)
385 {
386         loff_t i_size = i_size_read(mapping->host);
387 
388         if (i_size == 0)
389                 return 0;
390 
391         return wait_on_page_writeback_range(mapping, 0,
392                                 (i_size - 1) >> PAGE_CACHE_SHIFT);
393 }
394 EXPORT_SYMBOL(filemap_fdatawait);
395 
396 int filemap_write_and_wait(struct address_space *mapping)
397 {
398         int err = 0;
399 
400         if (mapping_nrpages(mapping)) {
401                 err = filemap_fdatawrite(mapping);
402                 /*
403                  * Even if the above returned error, the pages may be
404                  * written partially (e.g. -ENOSPC), so we wait for it.
405                  * But the -EIO is special case, it may indicate the worst
406                  * thing (e.g. bug) happened, so we avoid waiting for it.
407                  */
408                 if (err != -EIO) {
409                         int err2 = filemap_fdatawait(mapping);
410                         if (!err)
411                                 err = err2;
412                 }
413         }
414         return err;
415 }
416 EXPORT_SYMBOL(filemap_write_and_wait);
417 
418 /**
419  * filemap_write_and_wait_range - write out & wait on a file range
420  * @mapping:    the address_space for the pages
421  * @lstart:     offset in bytes where the range starts
422  * @lend:       offset in bytes where the range ends (inclusive)
423  *
424  * Write out and wait upon file offsets lstart->lend, inclusive.
425  *
426  * Note that `lend' is inclusive (describes the last byte to be written) so
427  * that this function can be used to write to the very end-of-file (end = -1).
428  */
429 int filemap_write_and_wait_range(struct address_space *mapping,
430                                  loff_t lstart, loff_t lend)
431 {
432         int err = 0;
433 
434         if (mapping_nrpages(mapping)) {
435                 err = __filemap_fdatawrite_range(mapping, lstart, lend,
436                                                  WB_SYNC_ALL);
437                 /* See comment of filemap_write_and_wait() */
438                 if (err != -EIO) {
439                         int err2 = wait_on_page_writeback_range(mapping,
440                                                 lstart >> PAGE_CACHE_SHIFT,
441                                                 lend >> PAGE_CACHE_SHIFT);
442                         if (!err)
443                                 err = err2;
444                 }
445         }
446         return err;
447 }
448 
/**
 * add_to_page_cache - add newly allocated pagecache pages
 * @page:       page to add
 * @mapping:    the page's address_space
 * @offset:     page index
 * @gfp_mask:   page allocation mode
 *
 * This function is used to add newly allocated pagecache pages;
 * the page is new, so we can just run SetPageLocked() against it.
 * The other page state flags were set by rmqueue().
 *
 * This function does not add the page to the LRU.  The caller must do that.
 *
 * On success the page has been inserted into the mapping's radix tree at
 * @offset, holds an extra reference, is locked, and has ->mapping and
 * ->index set; 0 is returned.  On failure the error from the memory-cgroup
 * charge, the radix-tree preload or the radix-tree insert is returned and
 * any charge taken here has been dropped again.  (find_or_create_page()
 * in this file treats -EEXIST from this path as "someone else added the
 * page first".)
 */
int add_to_page_cache(struct page *page, struct address_space *mapping,
                pgoff_t offset, gfp_t gfp_mask)
{
        /* __GFP_HIGHMEM is masked off for the charge and the preload. */
        int error = mem_cgroup_cache_charge(page, current->mm,
                                        gfp_mask & ~__GFP_HIGHMEM);
        if (error)
                goto out;

        error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
        if (error == 0) {
                DEFINE_RADIX_TREE_CONTEXT(ctx, &mapping->page_tree);

                lock_page_ref_irq(page);
                radix_tree_lock(&ctx);
                error = radix_tree_insert(ctx.tree, offset, page);
                radix_tree_unlock(&ctx);
                if (!error) {
                        /* Inserted: take the cache's reference and fill in. */
                        page_cache_get(page);
                        SetPageLocked(page);
                        page->mapping = mapping;
                        page->index = offset;
                        mapping_nrpages_inc(mapping);
                        __inc_zone_page_state(page, NR_FILE_PAGES);
                } else
                        /* Insert failed: undo the charge taken above. */
                        mem_cgroup_uncharge_page(page);

                unlock_page_ref_irq(page);
                radix_tree_preload_end();
        } else
                /* Preload failed: undo the charge taken above. */
                mem_cgroup_uncharge_page(page);
out:
        return error;
}
EXPORT_SYMBOL(add_to_page_cache);
496 
497 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
498                                 pgoff_t offset, gfp_t gfp_mask)
499 {
500         int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
501         if (ret == 0)
502                 lru_cache_add(page);
503         return ret;
504 }
505 
#ifdef CONFIG_NUMA
/*
 * Allocate a single (order-0) page for the page cache.
 *
 * When cpuset memory spreading is in effect the page is requested from the
 * node chosen by cpuset_mem_spread_node(); otherwise a plain alloc_pages()
 * is used.
 */
struct page *__page_cache_alloc(gfp_t gfp)
{
        int node;

        if (!cpuset_do_page_mem_spread())
                return alloc_pages(gfp, 0);

        node = cpuset_mem_spread_node();
        return alloc_pages_node(node, gfp, 0);
}
EXPORT_SYMBOL(__page_cache_alloc);
#endif
517 
/*
 * Wait-bit action that only sleeps in io_schedule(); unlike sync_page() it
 * never looks at the page or its mapping.  @word is unused.  Always
 * returns 0.  Used by __lock_page_nosync().
 */
static int __sleep_on_page_lock(void *word)
{
        io_schedule();
        return 0;
}
523 
/*
 * Wait-bit action that only calls schedule().  @word is unused.  Always
 * returns 0.  No user of this helper is visible in this part of the file.
 */
int __sleep_on_page(void *word)
{
        schedule();
        return 0;
}
529 
/*
 * Wake up tasks waiting on bit @bit of @page's flags word, using the wait
 * queue returned by page_waitqueue() for that page.
 */
static inline void wake_up_page(struct page *page, int bit)
{
        __wake_up_bit(page_waitqueue(page), &page->flags, bit);
}
534 
535 void wait_on_page_bit(struct page *page, int bit_nr)
536 {
537         DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
538 
539         if (test_bit(bit_nr, &page->flags))
540                 __wait_on_bit(page_waitqueue(page), &wait, sync_page,
541                                                         TASK_UNINTERRUPTIBLE);
542 }
543 EXPORT_SYMBOL(wait_on_page_bit);
544 
/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * The first mb is necessary to safely close the critical section opened by the
 * TestSetPageLocked(), the second mb is necessary to enforce ordering between
 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
 * parallel wait_on_page_locked()).
 *
 * Unlocking a page that is not locked is a BUG().
 */
void unlock_page(struct page *page)
{
        smp_mb__before_clear_bit();
        /* The lock bit must have been set; clearing an unlocked page is fatal. */
        if (!TestClearPageLocked(page))
                BUG();
        smp_mb__after_clear_bit();
        wake_up_page(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);
568 
/**
 * end_page_writeback - end writeback against a page
 * @page: the page
 *
 * Clears the page's reclaim flag, ends writeback on it and wakes up anyone
 * waiting on PG_writeback.
 */
void end_page_writeback(struct page *page)
{
        /*
         * If the reclaim flag was not set, or rotate_reclaimable_page()
         * returned non-zero, the writeback bit is cleared here and must
         * have been set (BUG() otherwise).
         *
         * NOTE(review): when the reclaim flag was set and
         * rotate_reclaimable_page() returns 0, nothing in this function
         * clears the writeback bit - presumably that helper does it
         * itself; confirm against its definition.
         */
        if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
                if (!test_clear_page_writeback(page))
                        BUG();
        }
        /* Order the bit clear before the waitqueue is examined. */
        smp_mb__after_clear_bit();
        wake_up_page(page, PG_writeback);
}
EXPORT_SYMBOL(end_page_writeback);
583 
/**
 * __lock_page - get a lock on the page, assuming we need to sleep to get it
 * @page: the page to lock
 *
 * Waits on PG_locked through __wait_on_bit_lock() on the page's wait queue,
 * in TASK_UNINTERRUPTIBLE, with sync_page() as the sleep action.
 *
 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 * chances are that on the second loop, the block layer's plug list is empty,
 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
 */
void __lock_page(struct page *page)
{
        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

        __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
                                                        TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
601 
/*
 * __lock_page_killable - killable variant of __lock_page()
 * @page: the page to lock
 *
 * Waits on PG_locked in TASK_KILLABLE with sync_page_killable() as the
 * sleep action, and returns the result of __wait_on_bit_lock().
 * sync_page_killable() reports -EINTR when a fatal signal is pending;
 * presumably that value is what the caller sees when the wait is
 * interrupted - confirm against __wait_on_bit_lock().
 */
int __lock_page_killable(struct page *page)
{
        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

        return __wait_on_bit_lock(page_waitqueue(page), &wait,
                                        sync_page_killable, TASK_KILLABLE);
}
609 
/**
 * __lock_page_nosync - get a lock on the page, without calling sync_page()
 * @page: the page to lock
 *
 * Variant of lock_page that does not require the caller to hold a reference
 * on the page's mapping.
 *
 * The sleep action is __sleep_on_page_lock(), which only calls
 * io_schedule() and never dereferences the page's mapping.
 */
void __lock_page_nosync(struct page *page)
{
        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
        __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
                                                        TASK_UNINTERRUPTIBLE);
}
623 
/**
 * find_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Is there a pagecache struct page at the given (mapping, offset) tuple?
 * If yes, increment its refcount and return it; if no, return NULL.
 *
 * The lookup is done under rcu_read_lock() only, and is retried from the
 * top whenever a step of the speculative-reference sequence below fails.
 */
struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
{
        void **pagep;
        struct page *page;

        rcu_read_lock();
repeat:
        page = NULL;
        pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
        if (pagep) {
                page = radix_tree_deref_slot(pagep);
                /* Slot is empty or flagged for retry: look it up again. */
                if (unlikely(!page || page == RADIX_TREE_RETRY))
                        goto repeat;

                /* Presumably fails when no reference could be taken; retry. */
                if (!page_cache_get_speculative(page))
                        goto repeat;

                /*
                 * Has the page moved?
                 * This is part of the lockless pagecache protocol. See
                 * include/linux/pagemap.h for details.
                 */
                if (unlikely(page != *pagep)) {
                        /* Slot now holds something else: drop our ref, retry. */
                        page_cache_release(page);
                        goto repeat;
                }
        }
        rcu_read_unlock();

        return page;
}
EXPORT_SYMBOL(find_get_page);
664 
665 /**
666  * find_lock_page - locate, pin and lock a pagecache page
667  * @mapping: the address_space to search
668  * @offset: the page index
669  *
670  * Locates the desired pagecache page, locks it, increments its reference
671  * count and returns its address.
672  *
673  * Returns zero if the page was not present. find_lock_page() may sleep.
674  */
675 struct page *find_lock_page(struct address_space *mapping,
676                                 pgoff_t offset)
677 {
678         struct page *page;
679 
680 repeat:
681         page = find_get_page(mapping, offset);
682         if (page) {
683                 lock_page(page);
684                 /* Has the page been truncated? */
685                 if (unlikely(page->mapping != mapping)) {
686                         unlock_page(page);
687                         page_cache_release(page);
688                         goto repeat;
689                 }
690         }
691         return page;
692 }
693 EXPORT_SYMBOL(find_lock_page);
694 
695 /**
696  * find_or_create_page - locate or add a pagecache page
697  * @mapping: the page's address_space
698  * @index: the page's index into the mapping
699  * @gfp_mask: page allocation mode
700  *
701  * Locates a page in the pagecache.  If the page is not present, a new page
702  * is allocated using @gfp_mask and is added to the pagecache and to the VM's
703  * LRU list.  The returned page is locked and has its reference count
704  * incremented.
705  *
706  * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic
707  * allocation!
708  *
709  * find_or_create_page() returns the desired page's address, or zero on
710  * memory exhaustion.
711  */
712 struct page *find_or_create_page(struct address_space *mapping,
713                 pgoff_t index, gfp_t gfp_mask)
714 {
715         struct page *page;
716         int err;
717 repeat:
718         page = find_lock_page(mapping, index);
719         if (!page) {
720                 page = __page_cache_alloc(gfp_mask);
721                 if (!page)
722                         return NULL;
723                 err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
724                 if (unlikely(err)) {
725                         page_cache_release(page);
726                         page = NULL;
727                         if (err == -EEXIST)
728                                 goto repeat;
729                 }
730         }
731         return page;
732 }
733 EXPORT_SYMBOL(find_or_create_page);
734 
/**
 * find_get_pages - gang pagecache lookup
 * @mapping:    The address_space to search
 * @start:      The starting page index
 * @nr_pages:   The maximum number of pages
 * @pages:      Where the resulting pages are placed
 *
 * find_get_pages() will search for and return a group of up to
 * @nr_pages pages in the mapping.  The pages are placed at @pages.
 * find_get_pages() takes a reference against the returned pages.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * The lookup runs under rcu_read_lock() only.  @pages is first filled
 * with radix-tree slot pointers by the gang lookup and is then overwritten
 * in place, from the front, with the pages a reference was obtained on.
 *
 * find_get_pages() returns the number of pages which were found.
 */
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
                            unsigned int nr_pages, struct page **pages)
{
        unsigned int i;
        unsigned int ret;
        unsigned int nr_found;

        rcu_read_lock();
restart:
        /* At this point pages[] temporarily holds slot pointers, not pages. */
        nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
                                (void ***)pages, start, nr_pages);
        ret = 0;
        for (i = 0; i < nr_found; i++) {
                struct page *page;
repeat:
                page = radix_tree_deref_slot((void **)pages[i]);
                /* Slot emptied since the gang lookup: skip it. */
                if (unlikely(!page))
                        continue;
                /*
                 * this can only trigger if nr_found == 1, making livelock
                 * a non issue.
                 */
                if (unlikely(page == RADIX_TREE_RETRY))
                        goto restart;

                /* Presumably fails when no reference could be taken; retry. */
                if (!page_cache_get_speculative(page))
                        goto repeat;

                /* Has the page moved? */
                if (unlikely(page != *((void **)pages[i]))) {
                        page_cache_release(page);
                        goto repeat;
                }

                /* ret <= i, so this never clobbers a slot not yet visited. */
                pages[ret] = page;
                ret++;
        }
        rcu_read_unlock();
        return ret;
}
791 
792 /**
793  * find_get_pages_contig - gang contiguous pagecache lookup
794  * @mapping:    The address_space to search
795  * @index:      The starting page index
796  * @nr_pages:   The maximum number of pages
797  * @pages:      Where the resulting pages are placed
798  *
799  * find_get_pages_contig() works exactly like find_get_pages(), except
800  * that the returned number of pages are guaranteed to be contiguous.
801  *
802  * find_get_pages_contig() returns the number of pages which were found.
803  */
804 unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
805                                unsigned int nr_pages, struct page **pages)
806 {
807         unsigned int i;
808         unsigned int ret;
809         unsigned int nr_found;
810 
811         rcu_read_lock();
812 restart:
813         nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
814                                 (void ***)pages, index, nr_pages);
815         ret = 0;
816         for (i = 0; i < nr_found; i++) {
817                 struct page *page;
818 repeat:
819                 page = radix_tree_deref_slot((void **)pages[i]);
820                 if (unlikely(!page))
821                         continue;
822                 /*
823                  * this can only trigger if nr_found == 1, making livelock
824                  * a non issue.
825                  */
826                 if (unlikely(page == RADIX_TREE_RETRY))
827                         goto restart;
828 
829                 if (page->mapping == NULL || page->index != index)
830                         break;
831 
832                 if (!page_cache_get_speculative(page))
833                         goto repeat;
834 
835                 /* Has the page moved? */
836                 if (unlikely(page != *((void **)pages[i]))) {
837                         page_cache_release(page);
838                         goto repeat;
839                 }
840 
841                 pages[ret] = page;
842                 ret++;
843                 index++;
844         }
845         rcu_read_unlock();
846         return ret;
847 }
848 EXPORT_SYMBOL(find_get_pages_contig);
849 
850 /**
851  * find_get_pages_tag - find and return pages that match @tag
852  * @mapping:    the address_space to search
853  * @index:      the starting page index
854  * @tag:        the tag index
855  * @nr_pages:   the maximum number of pages
856  * @pages:      where the resulting pages are placed
857  *
858  * Like find_get_pages, except we only return pages which are tagged with
859  * @tag.   We update @index to index the next page for the traversal.
860  */
861 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
862                         int tag, unsigned int nr_pages, struct page **pages)
863 {
864         unsigned int i;
865         unsigned int ret;
866         unsigned int nr_found;
867 
868         rcu_read_lock();
869 restart:
870         nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
871                                 (void ***)pages, *index, nr_pages, tag);
872         ret = 0;
873         for (i = 0; i < nr_found; i++) {
874                 struct page *page;
875 repeat:
876                 page = radix_tree_deref_slot((void **)pages[i]);
877                 if (unlikely(!page))
878                         continue;
879                 /*
880                  * this can only trigger if nr_found == 1, making livelock
881                  * a non issue.
882                  */
883                 if (unlikely(page == RADIX_TREE_RETRY))
884                         goto restart;
885 
886                 if (!page_cache_get_speculative(page))
887                         goto repeat;
888 
889                 /* Has the page moved? */
890                 if (unlikely(page != *((void **)pages[i]))) {
891                         page_cache_release(page);
892                         goto repeat;
893                 }
894 
895                 pages[ret] = page;
896                 ret++;
897         }
898         rcu_read_unlock();
899 
900         if (ret)
901                 *index = pages[ret - 1]->index + 1;
902 
903         return ret;
904 }
905 EXPORT_SYMBOL(find_get_pages_tag);
906 
907 /**
908  * grab_cache_page_nowait - returns locked page at given index in given cache
909  * @mapping: target address_space
910  * @index: the page index
911  *
912  * Same as grab_cache_page(), but do not wait if the page is unavailable.
913  * This is intended for speculative data generators, where the data can
914  * be regenerated if the page couldn't be grabbed.  This routine should
915  * be safe to call while holding the lock for another page.
916  *
917  * Clear __GFP_FS when allocating the page to avoid recursion into the fs
918  * and deadlock against the caller's locked page.
919  */
920 struct page *
921 grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
922 {
923         struct page *page = find_get_page(mapping, index);
924 
925         if (page) {
926                 if (!TestSetPageLocked(page))
927                         return page;
928                 page_cache_release(page);
929                 return NULL;
930         }
931         page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
932         if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
933                 page_cache_release(page);
934                 page = NULL;
935         }
936         return page;
937 }
938 EXPORT_SYMBOL(grab_cache_page_nowait);
939 
940 /*
941  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
942  * a _large_ part of the i/o request. Imagine the worst scenario:
943  *
944  *      ---R__________________________________________B__________
945  *         ^ reading here                             ^ bad block(assume 4k)
946  *
947  * read(R) => miss => readahead(R...B) => media error => frustrating retries
948  * => failing the whole request => read(R) => read(R+1) =>
949  * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
950  * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
951  * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
952  *
953  * It is going insane. Fix it by quickly scaling down the readahead size.
954  */
955 static void shrink_readahead_size_eio(struct file *filp,
956                                         struct file_ra_state *ra)
957 {
958         if (!ra->ra_pages)
959                 return;
960 
961         ra->ra_pages /= 4;
962 }
963 
964 /**
965  * do_generic_file_read - generic file read routine
966  * @filp:       the file to read
967  * @ppos:       current file position
968  * @desc:       read_descriptor
969  * @actor:      read method
970  *
971  * This is a generic file read routine, and uses the
972  * mapping->a_ops->readpage() function for the actual low-level stuff.
973  *
974  * This is really ugly. But the goto's actually try to clarify some
975  * of the logic when it comes to error handling etc.
976  */
977 static void do_generic_file_read(struct file *filp, loff_t *ppos,
978                 read_descriptor_t *desc, read_actor_t actor)
979 {
980         struct address_space *mapping = filp->f_mapping;
981         struct inode *inode = mapping->host;
982         struct file_ra_state *ra = &filp->f_ra;
983         pgoff_t index;
984         pgoff_t last_index;
985         pgoff_t prev_index;
986         unsigned long offset;      /* offset into pagecache page */
987         unsigned int prev_offset;
988         int error;
989 
990         index = *ppos >> PAGE_CACHE_SHIFT;
991         prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
992         prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
993         last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
994         offset = *ppos & ~PAGE_CACHE_MASK;
995 
996         for (;;) {
997                 struct page *page;
998                 pgoff_t end_index;
999                 loff_t isize;
1000                 unsigned long nr, ret;
1001 
1002                 cond_resched();
1003 find_page:
1004                 page = find_get_page(mapping, index);
1005                 if (!page) {
1006                         page_cache_sync_readahead(mapping,
1007                                         ra, filp,
1008                                         index, last_index - index);
1009                         page = find_get_page(mapping, index);
1010                         if (unlikely(page == NULL))
1011                                 goto no_cached_page;
1012                 }
1013                 if (PageReadahead(page)) {
1014                         page_cache_async_readahead(mapping,
1015                                         ra, filp, page,
1016                                         index, last_index - index);
1017                 }
1018                 if (!PageUptodate(page))
1019                         goto page_not_up_to_date;
1020 page_ok:
1021                 /*
1022                  * i_size must be checked after we know the page is Uptodate.
1023                  *
1024                  * Checking i_size after the check allows us to calculate
1025                  * the correct value for "nr", which means the zero-filled
1026                  * part of the page is not copied back to userspace (unless
1027                  * another truncate extends the file - this is desired though).
1028                  */
1029 
1030                 isize = i_size_read(inode);
1031                 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1032                 if (unlikely(!isize || index > end_index)) {
1033                         page_cache_release(page);
1034                         goto out;
1035                 }
1036 
1037                 /* nr is the maximum number of bytes to copy from this page */
1038                 nr = PAGE_CACHE_SIZE;
1039                 if (index == end_index) {
1040                         nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1041                         if (nr <= offset) {
1042                                 page_cache_release(page);
1043                                 goto out;
1044                         }
1045                 }
1046                 nr = nr - offset;
1047 
1048                 /* If users can be writing to this page using arbitrary
1049                  * virtual addresses, take care about potential aliasing
1050                  * before reading the page on the kernel side.
1051                  */
1052                 if (mapping_writably_mapped(mapping))
1053                         flush_dcache_page(page);
1054 
1055                 /*
1056                  * When a sequential read accesses a page several times,
1057                  * only mark it as accessed the first time.
1058                  */
1059                 if (prev_index != index || offset != prev_offset)
1060                         mark_page_accessed(page);
1061                 prev_index = index;
1062 
1063                 /*
1064                  * Ok, we have the page, and it's up-to-date, so
1065                  * now we can copy it to user space...
1066                  *
1067                  * The actor routine returns how many bytes were actually used..
1068                  * NOTE! This may not be the same as how much of a user buffer
1069                  * we filled up (we may be padding etc), so we can only update
1070                  * "pos" here (the actor routine has to update the user buffer
1071                  * pointers and the remaining count).
1072                  */
1073                 ret = actor(desc, page, offset, nr);
1074                 offset += ret;
1075                 index += offset >> PAGE_CACHE_SHIFT;
1076                 offset &= ~PAGE_CACHE_MASK;
1077                 prev_offset = offset;
1078 
1079                 page_cache_release(page);
1080                 if (ret == nr && desc->count)
1081                         continue;
1082                 goto out;
1083 
1084 page_not_up_to_date:
1085                 /* Get exclusive access to the page ... */
1086                 if (lock_page_killable(page))
1087                         goto readpage_eio;
1088 
1089                 /* Did it get truncated before we got the lock? */
1090                 if (!page->mapping) {
1091                         unlock_page(page);
1092                         page_cache_release(page);
1093                         continue;
1094                 }
1095 
1096                 /* Did somebody else fill it already? */
1097                 if (PageUptodate(page)) {
1098                         unlock_page(page);
1099                         goto page_ok;
1100                 }
1101 
1102 readpage:
1103                 /* Start the actual read. The read will unlock the page. */
1104                 error = mapping->a_ops->readpage(filp, page);
1105 
1106                 if (unlikely(error)) {
1107                         if (error == AOP_TRUNCATED_PAGE) {
1108                                 page_cache_release(page);
1109                                 goto find_page;
1110                         }
1111                         goto readpage_error;
1112                 }
1113 
1114                 if (!PageUptodate(page)) {
1115                         if (lock_page_killable(page))
1116                                 goto readpage_eio;
1117                         if (!PageUptodate(page)) {
1118                                 if (page->mapping == NULL) {
1119                                         /*
1120                                          * invalidate_inode_pages got it
1121                                          */
1122                                         unlock_page(page);
1123                                         page_cache_release(page);
1124                                         goto find_page;
1125                                 }
1126                                 unlock_page(page);
1127                                 shrink_readahead_size_eio(filp, ra);
1128                                 goto readpage_eio;
1129                         }
1130                         unlock_page(page);
1131                 }
1132 
1133                 goto page_ok;
1134 
1135 readpage_eio:
1136                 error = -EIO;
1137 readpage_error:
1138                 /* UHHUH! A synchronous read error occurred. Report it */
1139                 desc->error = error;
1140                 page_cache_release(page);
1141                 goto out;
1142 
1143 no_cached_page:
1144                 /*
1145                  * Ok, it wasn't cached, so we need to create a new
1146                  * page..
1147                  */
1148                 page = page_cache_alloc_cold(mapping);
1149                 if (!page) {
1150                         desc->error = -ENOMEM;
1151                         goto out;
1152                 }
1153                 error = add_to_page_cache_lru(page, mapping,
1154                                                 index, GFP_KERNEL);
1155                 if (error) {
1156                         page_cache_release(page);
1157                         if (error == -EEXIST)
1158                                 goto find_page;
1159                         desc->error = error;
1160                         goto out;
1161                 }
1162                 goto readpage;
1163         }
1164 
1165 out:
1166         ra->prev_pos = prev_index;
1167         ra->prev_pos <<= PAGE_CACHE_SHIFT;
1168         ra->prev_pos |= prev_offset;
1169 
1170         *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1171         if (filp)
1172                 file_accessed(filp);
1173 }
1174 
1175 int file_read_actor(read_descriptor_t *desc, struct page *page,
1176                         unsigned long offset, unsigned long size)
1177 {
1178         char *kaddr;
1179         unsigned long left, count = desc->count;
1180 
1181         if (size > count)
1182                 size = count;
1183 
1184         /*
1185          * Faults on the destination of a read are common, so do it before
1186          * taking the kmap.
1187          */
1188         if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1189                 kaddr = kmap_atomic(page, KM_USER0);
1190                 left = __copy_to_user_inatomic(desc->arg.buf,
1191                                                 kaddr + offset, size);
1192                 kunmap_atomic(kaddr, KM_USER0);
1193                 if (left == 0)
1194                         goto success;
1195         }
1196 
1197         /* Do it the slow way */
1198         kaddr = kmap(page);
1199         left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1200         kunmap(page);
1201 
1202         if (left) {
1203                 size -= left;
1204                 desc->error = -EFAULT;
1205         }
1206 success:
1207         desc->count = count - size;
1208         desc->written += size;
1209         desc->arg.buf += size;
1210         return size;
1211 }
1212 
1213 /*
1214  * Performs necessary checks before doing a write
1215  * @iov:        io vector request
1216  * @nr_segs:    number of segments in the iovec
1217  * @count:      number of bytes to write
1218  * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
1219  *
1220  * Adjust number of segments and amount of bytes to write (nr_segs should be
1221  * properly initialized first). Returns appropriate error code that caller
1222  * should return or zero in case that write should be allowed.
1223  */
1224 int generic_segment_checks(const struct iovec *iov,
1225                         unsigned long *nr_segs, size_t *count, int access_flags)
1226 {
1227         unsigned long   seg;
1228         size_t cnt = 0;
1229         for (seg = 0; seg < *nr_segs; seg++) {
1230                 const struct iovec *iv = &iov[seg];
1231 
1232                 /*
1233                  * If any segment has a negative length, or the cumulative
1234                  * length ever wraps negative then return -EINVAL.
1235                  */
1236                 cnt += iv->iov_len;
1237                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1238                         return -EINVAL;
1239                 if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1240                         continue;
1241                 if (seg == 0)
1242                         return -EFAULT;
1243                 *nr_segs = seg;
1244                 cnt -= iv->iov_len;     /* This segment is no good */
1245                 break;
1246         }
1247         *count = cnt;
1248         return 0;
1249 }
1250 EXPORT_SYMBOL(generic_segment_checks);
1251 
1252 /**
1253  * generic_file_aio_read - generic filesystem read routine
1254  * @iocb:       kernel I/O control block
1255  * @iov:        io vector request
1256  * @nr_segs:    number of segments in the iovec
1257  * @pos:        current file position
1258  *
1259  * This is the "read()" routine for all filesystems
1260  * that can use the page cache directly.
1261  */
1262 ssize_t
1263 generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1264                 unsigned long nr_segs, loff_t pos)
1265 {
1266         struct file *filp = iocb->ki_filp;
1267         ssize_t retval;
1268         unsigned long seg;
1269         size_t count;
1270         loff_t *ppos = &iocb->ki_pos;
1271 
1272         count = 0;
1273         retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1274         if (retval)
1275                 return retval;
1276 
1277         /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1278         if (filp->f_flags & O_DIRECT) {
1279                 loff_t size;
1280                 struct address_space *mapping;
1281                 struct inode *inode;
1282 
1283                 mapping = filp->f_mapping;
1284                 inode = mapping->host;
1285                 retval = 0;
1286                 if (!count)
1287                         goto out; /* skip atime */
1288                 size = i_size_read(inode);
1289                 if (pos < size) {
1290                         retval = generic_file_direct_IO(READ, iocb,
1291                                                 iov, pos, nr_segs);
1292                         if (retval > 0)
1293                                 *ppos = pos + retval;
1294                 }
1295                 if (likely(retval != 0)) {
1296                         file_accessed(filp);
1297                         goto out;
1298                 }
1299         }
1300 
1301         retval = 0;
1302         if (count) {
1303                 for (seg = 0; seg < nr_segs; seg++) {
1304                         read_descriptor_t desc;
1305 
1306                         desc.written = 0;
1307                         desc.arg.buf = iov[seg].iov_base;
1308                         desc.count = iov[seg].iov_len;
1309                         if (desc.count == 0)
1310                                 continue;
1311                         desc.error = 0;
1312                         do_generic_file_read(filp,ppos,&desc,file_read_actor);
1313                         retval += desc.written;
1314                         if (desc.error) {
1315                                 retval = retval ?: desc.error;
1316                                 break;
1317                         }
1318                         if (desc.count > 0)
1319                                 break;
1320                 }
1321         }
1322 out:
1323         return retval;
1324 }
1325 EXPORT_SYMBOL(generic_file_aio_read);
1326 
1327 static ssize_t
1328 do_readahead(struct address_space *mapping, struct file *filp,
1329              pgoff_t index, unsigned long nr)
1330 {
1331         if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1332                 return -EINVAL;
1333 
1334         force_page_cache_readahead(mapping, filp, index,
1335                                         max_sane_readahead(nr));
1336         return 0;
1337 }
1338 
1339 asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1340 {
1341         ssize_t ret;
1342         struct file *file;
1343 
1344         ret = -EBADF;
1345         file = fget(fd);
1346         if (file) {
1347                 if (file->f_mode & FMODE_READ) {
1348                         struct address_space *mapping = file->f_mapping;
1349                         pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1350                         pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1351                         unsigned long len = end - start + 1;
1352                         ret = do_readahead(mapping, file, start, len);
1353                 }
1354                 fput(file);
1355         }
1356         return ret;
1357 }
1358 
1359 #ifdef CONFIG_MMU
1360 /**
1361  * page_cache_read - adds requested page to the page cache if not already there
1362  * @file:       file to read
1363  * @offset:     page index
1364  *
1365  * This adds the requested page to the page cache if it isn't already there,
1366  * and schedules an I/O to read in its contents from disk.
1367  */
1368 static int page_cache_read(struct file *file, pgoff_t offset)
1369 {
1370         struct address_space *mapping = file->f_mapping;
1371         struct page *page; 
1372         int ret;
1373 
1374         do {
1375                 page = page_cache_alloc_cold(mapping);
1376                 if (!page)
1377                         return -ENOMEM;
1378 
1379                 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1380                 if (ret == 0)
1381                         ret = mapping->a_ops->readpage(file, page);
1382                 else if (ret == -EEXIST)
1383                         ret = 0; /* losing race to add is OK */
1384 
1385                 page_cache_release(page);
1386 
1387         } while (ret == AOP_TRUNCATED_PAGE);
1388                 
1389         return ret;
1390 }
1391 
1392 #define MMAP_LOTSAMISS  (100)
1393 
1394 /**
1395  * filemap_fault - read in file data for page fault handling
1396  * @vma:        vma in which the fault was taken
1397  * @vmf:        struct vm_fault containing details of the fault
1398  *
1399  * filemap_fault() is invoked via the vma operations vector for a
1400  * mapped memory region to read in file data during a page fault.
1401  *
1402  * The goto's are kind of ugly, but this streamlines the normal case of having
1403  * it in the page cache, and handles the special cases reasonably without
1404  * having a lot of duplicated code.
      *
      * On success the locked page is stored in @vmf->page and the return value
      * has VM_FAULT_LOCKED set, with VM_FAULT_MAJOR added when the fault had to
      * start read-around or re-read a page that was not up to date.
      * VM_FAULT_SIGBUS is returned when @vmf->pgoff lies at or beyond the end of
      * the file or when the page could not be read, and VM_FAULT_OOM when
      * page_cache_read() fails with -ENOMEM.
1405  */
1406 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1407 {
1408         int error;
1409         struct file *file = vma->vm_file;
1410         struct address_space *mapping = file->f_mapping;
1411         struct file_ra_state *ra = &file->f_ra;
1412         struct inode *inode = mapping->host;
1413         struct page *page;
1414         pgoff_t size;
1415         int did_readaround = 0;
1416         int ret = 0;
1417 
             /* File size in pages, rounded up; faults past it get SIGBUS. */
1418         size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1419         if (vmf->pgoff >= size)
1420                 return VM_FAULT_SIGBUS;
1421 
1422         /* If we don't want any read-ahead, don't bother */
1423         if (VM_RandomReadHint(vma))
1424                 goto no_cached_page;
1425 
1426         /*
1427          * Do we have something in the page cache already?
1428          */
1429 retry_find:
1430         page = find_lock_page(mapping, vmf->pgoff);
1431         /*
1432          * For sequential accesses, we use the generic readahead logic.
1433          */
1434         if (VM_SequentialReadHint(vma)) {
1435                 if (!page) {
1436                         page_cache_sync_readahead(mapping, ra, file,
1437                                                            vmf->pgoff, 1);
1438                         page = find_lock_page(mapping, vmf->pgoff);
1439                         if (!page)
1440                                 goto no_cached_page;
1441                 }
1442                 if (PageReadahead(page)) {
1443                         page_cache_async_readahead(mapping, ra, file, page,
1444                                                            vmf->pgoff, 1);
1445                 }
1446         }
1447 
             /* Still no page: count the miss and try read-around. */
1448         if (!page) {
1449                 unsigned long ra_pages;
1450 
1451                 ra->mmap_miss++;
1452 
1453                 /*
1454                  * Do we miss much more than hit in this file? If so,
1455                  * stop bothering with read-ahead. It will only hurt.
1456                  */
1457                 if (ra->mmap_miss > MMAP_LOTSAMISS)
1458                         goto no_cached_page;
1459 
1460                 /*
1461                  * To keep the pgmajfault counter straight, we need to
1462                  * check did_readaround, as this is an inner loop.
1463                  */
1464                 if (!did_readaround) {
1465                         ret = VM_FAULT_MAJOR;
1466                         count_vm_event(PGMAJFAULT);
1467                 }
1468                 did_readaround = 1;
1469                 ra_pages = max_sane_readahead(file->f_ra.ra_pages);
1470                 if (ra_pages) {
1471                         pgoff_t start = 0;
1472 
                             /*
                              * Start the window ra_pages / 2 pages before the
                              * faulting page, clamped at the start of the file.
                              */
1473                         if (vmf->pgoff > ra_pages / 2)
1474                                 start = vmf->pgoff - ra_pages / 2;
1475                         do_page_cache_readahead(mapping, file, start, ra_pages);
1476                 }
1477                 page = find_lock_page(mapping, vmf->pgoff);
1478                 if (!page)
1479                         goto no_cached_page;
1480         }
1481 
             /* A page found without any read-around offsets an earlier miss. */
1482         if (!did_readaround)
1483                 ra->mmap_miss--;
1484 
1485         /*
1486          * We have a locked page in the page cache, now we need to check
1487          * that it's up-to-date. If not, it is going to be due to an error.
1488          */
1489         if (unlikely(!PageUptodate(page)))
1490                 goto page_not_uptodate;
1491 
1492         /* Must recheck i_size under page lock */
1493         size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1494         if (unlikely(vmf->pgoff >= size)) {
1495                 unlock_page(page);
1496                 page_cache_release(page);
1497                 return VM_FAULT_SIGBUS;
1498         }
1499 
1500         /*
1501          * Found the page and have a reference on it.
1502          */
1503         mark_page_accessed(page);
1504         ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1505         vmf->page = page;
1506         return ret | VM_FAULT_LOCKED;
1507 
1508 no_cached_page:
1509         /*
1510          * We're only likely to ever get here if MADV_RANDOM is in
1511          * effect.
1512          */
1513         error = page_cache_read(file, vmf->pgoff);
1514 
1515         /*
1516          * The page we want has now been added to the page cache.
1517          * In the unlikely event that someone removed it in the
1518          * meantime, we'll just come back here and read it again.
1519          */
1520         if (error >= 0)
1521                 goto retry_find;
1522 
1523         /*
1524          * An error return from page_cache_read can result if the
1525          * system is low on memory, or a problem occurs while trying
1526          * to schedule I/O.
1527          */
1528         if (error == -ENOMEM)
1529                 return VM_FAULT_OOM;
1530         return VM_FAULT_SIGBUS;
1531 
1532 page_not_uptodate:
1533         /* IO error path */
1534         if (!did_readaround) {
1535                 ret = VM_FAULT_MAJOR;
1536                 count_vm_event(PGMAJFAULT);
1537         }
1538 
1539         /*
1540          * Umm, take care of errors if the page isn't up-to-date.
1541          * Try to re-read it _once_. We do this synchronously,
1542          * because there really aren't any performance issues here
1543          * and we need to check for errors.
1544          */
1545         ClearPageError(page);
1546         error = mapping->a_ops->readpage(file, page);
1547         page_cache_release(page);
1548 
             /* Look the page up again after the re-read or a truncation. */
1549         if (!error || error == AOP_TRUNCATED_PAGE)
1550                 goto retry_find;
1551 
1552         /* Things didn't work out. Report SIGBUS to tell the mm layer so. */
1553         shrink_readahead_size_eio(file, ra);
1554         return VM_FAULT_SIGBUS;
1555 }
1556 EXPORT_SYMBOL(filemap_fault);
1557 
1558 struct vm_operations_struct generic_file_vm_ops = {
1559         .fault          = filemap_fault,  /* page faults are handled by filemap_fault() */
1560 };
1561 
1562 /* This is used for a general mmap of a disk file */
1563 
1564 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1565 {
1566         struct address_space *mapping = file->f_mapping;
1567 
1568         if (!mapping->a_ops->readpage)
1569                 return -ENOEXEC;
1570         file_accessed(file);
1571         vma->vm_ops = &generic_file_vm_ops;
1572         vma->vm_flags |= VM_CAN_NONLINEAR;
1573         return 0;
1574 }
1575 
1576 /*
1577  * This is for filesystems which do not implement ->writepage.
1578  */
1579 int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
1580 {
1581         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
1582                 return -EINVAL;
1583         return generic_file_mmap(file, vma);
1584 }
1585 #else
1586 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
1587 {
1588         return -ENOSYS; /* !CONFIG_MMU build: this stub always fails */
1589 }
1590 int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma)
1591 {
1592         return -ENOSYS; /* !CONFIG_MMU build: this stub always fails */
1593 }
1594 #endif /* CONFIG_MMU */
1595 
1596 EXPORT_SYMBOL(generic_file_mmap);
1597 EXPORT_SYMBOL(generic_file_readonly_mmap);
1598 
1599 static struct page *__read_cache_page(struct address_space *mapping,
1600                                 pgoff_t index,
1601                                 int (*filler)(void *,struct page*),
1602                                 void *data)
1603 {
1604         struct page *page;
1605         int err;
1606 repeat:
1607         page = find_get_page(mapping, index);
1608         if (!page) {
1609                 page = page_cache_alloc_cold(mapping);
1610                 if (!page)
1611                         return ERR_PTR(-ENOMEM);
1612                 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1613                 if (unlikely(err)) {
1614                         page_cache_release(page);
1615                         if (err == -EEXIST)
1616                                 goto repeat;
1617                         /* Presumably ENOMEM for radix tree node */
1618                         return ERR_PTR(err);
1619                 }
1620                 err = filler(data, page);
1621                 if (err < 0) {
1622                         page_cache_release(page);
1623                         page = ERR_PTR(err);
1624                 }
1625         }
1626         return page;
1627 }
1628 
1629 /**
1630  * read_cache_page_async - read into page cache, fill it if needed
1631  * @mapping:    the page's address_space
1632  * @index:      the page index
1633  * @filler:     function to perform the read
1634  * @data:       destination for read data
1635  *
1636  * Same as read_cache_page, but don't wait for page to become unlocked
1637  * after submitting it to the filler.
1638  *
1639  * Read into the page cache. If a page already exists, and PageUptodate() is
1640  * not set, try to fill the page but don't wait for it to become unlocked.
1641  *
1642  * If the page does not get brought uptodate, return -EIO.
1643  */
1644 struct page *read_cache_page_async(struct address_space *mapping,
1645                                 pgoff_t index,
1646                                 int (*filler)(void *,struct page*),
1647                                 void *data)
1648 {
1649         struct page *page;
1650         int err;
1651 
1652 retry:
1653         page = __read_cache_page(mapping, index, filler, data);
1654         if (IS_ERR(page))
1655                 return page;
1656         if (PageUptodate(page))
1657                 goto out;
1658 
1659         lock_page(page);
1660         if (!page->mapping) {
1661                 unlock_page(page);
1662                 page_cache_release(page);
1663                 goto retry;
1664         }
1665         if (PageUptodate(page)) {
1666                 unlock_page(page);
1667                 goto out;
1668         }
1669         err = filler(data, page);
1670         if (err < 0) {
1671                 page_cache_release(page);
1672                 return ERR_PTR(err);
1673         }
1674 out:
1675         mark_page_accessed(page);
1676         return page;
1677 }
1678 EXPORT_SYMBOL(read_cache_page_async);
1679 
1680 /**
1681  * read_cache_page - read into page cache, fill it if needed
1682  * @mapping:    the page's address_space
1683  * @index:      the page index
1684  * @filler:     function to perform the read
1685  * @data:       destination for read data
1686  *
1687  * Read into the page cache. If a page already exists, and PageUptodate() is
1688  * not set, try to fill the page then wait for it to become unlocked.
1689  *
1690  * If the page does not get brought uptodate, return -EIO.
1691  */
1692 struct page *read_cache_page(struct address_space *mapping,
1693                                 pgoff_t index,
1694                                 int (*filler)(void *,struct page*),
1695                                 void *data)
1696 {
1697         struct page *page;
1698 
1699         page = read_cache_page_async(mapping, index, filler, data);
1700         if (IS_ERR(page))
1701                 goto out;
1702         wait_on_page_locked(page);
1703         if (!PageUptodate(page)) {
1704                 page_cache_release(page);
1705                 page = ERR_PTR(-EIO);
1706         }
1707  out:
1708         return page;
1709 }
1710 EXPORT_SYMBOL(read_cache_page);
1711 
1712 /*
1713  * The logic we want is
1714  *
1715  *      if suid or (sgid and xgrp)
1716  *              remove privs
1717  */
1718 int should_remove_suid(struct dentry *dentry)
1719 {
1720         mode_t mode = dentry->d_inode->i_mode;
1721         int kill = 0;
1722 
1723         /* suid always must be killed */
1724         if (unlikely(mode & S_ISUID))
1725                 kill = ATTR_KILL_SUID;
1726 
1727         /*
1728          * sgid without any exec bits is just a mandatory locking mark; leave
1729          * it alone.  If some exec bits are set, it's a real sgid; kill it.
1730          */
1731         if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1732                 kill |= ATTR_KILL_SGID;
1733 
1734         if (unlikely(kill && !capable(CAP_FSETID)))
1735                 return kill;
1736 
1737         return 0;
1738 }
1739 EXPORT_SYMBOL(should_remove_suid);
1740 
1741 int __remove_suid(struct dentry *dentry, int kill)
1742 {
1743         struct iattr newattrs;
1744 
1745         newattrs.ia_valid = ATTR_FORCE | kill;
1746         return notify_change(dentry, &newattrs);
1747 }
1748 
/*
 * Strip privileges from @dentry: first whatever the security module wants
 * killed, then the suid/sgid bits selected by should_remove_suid().
 *
 * Returns 0 on success, a negative value from
 * security_inode_need_killpriv(), or the error of the first step that
 * failed.
 */
int remove_suid(struct dentry *dentry)
{
	const int suid_mask = should_remove_suid(dentry);
	const int need_priv = security_inode_need_killpriv(dentry);
	int rc;

	if (need_priv < 0)
		return need_priv;

	if (need_priv) {
		rc = security_inode_killpriv(dentry);
		if (rc)
			return rc;
	}

	if (!suid_mask)
		return 0;

	return __remove_suid(dentry, suid_mask);
}
EXPORT_SYMBOL(remove_suid);
1765 
1766 static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1767                         const struct iovec *iov, size_t base, size_t bytes)
1768 {
1769         size_t copied = 0, left = 0;
1770 
1771         while (bytes) {
1772                 char __user *buf = iov->iov_base + base;
1773                 int copy = min(bytes, iov->iov_len - base);
1774 
1775                 base = 0;
1776                 left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
1777                 copied += copy;
1778                 bytes -= copy;
1779                 vaddr += copy;
1780                 iov++;
1781 
1782                 if (unlikely(left))
1783                         break;
1784         }
1785         return copied - left;
1786 }
1787 
1788 /*
1789  * Copy as much as we can into the page and return the number of bytes which
1790  * were sucessfully copied.  If a fault is encountered then return the number of
1791  * bytes which were copied.
1792  */
1793 size_t iov_iter_copy_from_user_atomic(struct page *page,
1794                 struct iov_iter *i, unsigned long offset, size_t bytes)
1795 {
1796         char *kaddr;
1797         size_t copied;
1798 
1799 #ifndef CONFIG_PREEMPT_RT
1800         BUG_ON(!current->pagefault_disabled);
1801 #endif
1802         kaddr = kmap_atomic(page, KM_USER0);
1803         if (likely(i->nr_segs == 1)) {
1804                 int left;
1805                 char __user *buf = i->iov->iov_base + i->iov_offset;
1806                 left = __copy_from_user_inatomic_nocache(kaddr + offset,
1807                                                         buf, bytes);
1808                 copied = bytes - left;
1809         } else {
1810                 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1811                                                 i->iov, i->iov_offset, bytes);
1812         }
1813         kunmap_atomic(kaddr, KM_USER0);
1814 
1815         return copied;
1816 }
1817 EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
1818 
1819 /*
1820  * This has the same sideeffects and return value as
1821  * iov_iter_copy_from_user_atomic().
1822  * The difference is that it attempts to resolve faults.
1823  * Page must not be locked.
1824  */
1825 size_t iov_iter_copy_from_user(struct page *page,
1826                 struct iov_iter *i, unsigned long offset, size_t bytes)
1827 {
1828         char *kaddr;
1829         size_t copied;
1830 
1831         kaddr = kmap(page);
1832         if (likely(i->nr_segs == 1)) {
1833                 int left;
1834                 char __user *buf = i->iov->iov_base + i->iov_offset;
1835                 left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
1836                 copied = bytes - left;
1837         } else {
1838                 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1839                                                 i->iov, i->iov_offset, bytes);
1840         }
1841         kunmap(page);
1842         return copied;
1843 }
1844 EXPORT_SYMBOL(iov_iter_copy_from_user);
1845 
1846 void iov_iter_advance(struct iov_iter *i, size_t bytes)
1847 {
1848         BUG_ON(i->count < bytes);
1849 
1850         if (likely(i->nr_segs == 1)) {
1851                 i->iov_offset += bytes;
1852                 i->count -= bytes;
1853         } else {
1854                 const struct iovec *iov = i->iov;
1855                 size_t base = i->iov_offset;
1856 
1857                 /*
1858                  * The !iov->iov_len check ensures we skip over unlikely
1859                  * zero-length segments (without overruning the iovec).
1860                  */
1861                 while (bytes || unlikely(!iov->iov_len && i->count)) {
1862                         int copy;
1863 
1864                         copy = min(bytes, iov->iov_len - base);
1865                         BUG_ON(!i->count || i->count < copy);
1866                         i->count -= copy;
1867                         bytes -= copy;
1868                         base += copy;
1869                         if (iov->iov_len == base) {
1870                                 iov++;
1871                                 base = 0;
1872                         }
1873                 }
1874                 i->iov = iov;
1875                 i->iov_offset = base;
1876         }
1877 }
1878 EXPORT_SYMBOL(iov_iter_advance);
1879 
1880 /*
1881  * Fault in the first iovec of the given iov_iter, to a maximum length
1882  * of bytes. Returns 0 on success, or non-zero if the memory could not be
1883  * accessed (ie. because it is an invalid address).
1884  *
1885  * writev-intensive code may want this to prefault several iovecs -- that
1886  * would be possible (callers must not rely on the fact that _only_ the
1887  * first iovec will be faulted with the current implementation).
1888  */
1889 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
1890 {
1891         char __user *buf = i->iov->iov_base + i->iov_offset;
1892         bytes = min(bytes, i->iov->iov_len - i->iov_offset);
1893         return fault_in_pages_readable(buf, bytes);
1894 }
1895 EXPORT_SYMBOL(iov_iter_fault_in_readable);
1896 
1897 /*
1898  * Return the count of just the current iov_iter segment.
1899  */
1900 size_t iov_iter_single_seg_count(struct iov_iter *i)
1901 {
1902         const struct iovec *iov = i->iov;
1903         if (i->nr_segs == 1)
1904                 return i->count;
1905         else
1906                 return min(i->count, iov->iov_len - i->iov_offset);
1907 }
1908 EXPORT_SYMBOL(iov_iter_single_seg_count);
1909 
1910 /*
1911  * Performs necessary checks before doing a write
1912  *
1913  * Can adjust writing position or amount of bytes to write.
1914  * Returns appropriate error code that caller should return or
1915  * zero in case that write should be allowed.
      *
      * @file:  file being written; f_flags (O_APPEND, O_LARGEFILE) and
      *         f_mapping->host are consulted
      * @pos:   in/out write position; replaced by the inode size for
      *         O_APPEND writes to non-block files
      * @count: in/out byte count; may be reduced so the write stays within
      *         RLIMIT_FSIZE, MAX_NON_LFS, s_maxbytes or the device size
      * @isblk: non-zero if the target is a block device
      *
      * Error returns visible here: -EINVAL (negative position), -EFBIG
      * (a size limit is already reached; SIGXFSZ is also sent when that
      * limit is RLIMIT_FSIZE), -EPERM (read-only block device, or block
      * device without CONFIG_BLOCK) and -ENOSPC (at or past the end of a
      * block device).
1916  */
1917 inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
1918 {
1919         struct inode *inode = file->f_mapping->host;
1920         unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1921 
1922         if (unlikely(*pos < 0))
1923                 return -EINVAL;
1924 
             /* O_APPEND and RLIMIT_FSIZE handling is skipped for block devices. */
1925         if (!isblk) {
1926                 /* FIXME: this is for backwards compatibility with 2.4 */
1927                 if (file->f_flags & O_APPEND)
1928                         *pos = i_size_read(inode);
1929 
1930                 if (limit != RLIM_INFINITY) {
1931                         if (*pos >= limit) {
1932                                 send_sig(SIGXFSZ, current, 0);
1933                                 return -EFBIG;
1934                         }
                             /* *pos < limit here, so the subtraction cannot wrap. */
1935                         if (*count > limit - (typeof(limit))*pos) {
1936                                 *count = limit - (typeof(limit))*pos;
1937                         }
1938                 }
1939         }
1940 
1941         /*
1942          * LFS rule
              *
              * Without O_LARGEFILE a write may not reach past MAX_NON_LFS:
              * fail if it starts there, otherwise shorten it.
1943          */
1944         if (unlikely(*pos + *count > MAX_NON_LFS &&
1945                                 !(file->f_flags & O_LARGEFILE))) {
1946                 if (*pos >= MAX_NON_LFS) {
1947                         return -EFBIG;
1948                 }
1949                 if (*count > MAX_NON_LFS - (unsigned long)*pos) {
1950                         *count = MAX_NON_LFS - (unsigned long)*pos;
1951                 }
1952         }
1953 
1954         /*
1955          * Are we about to exceed the fs block limit ?
1956          *
1957          * If we have written data it becomes a short write.  If we have
1958          * exceeded without writing data we send a signal and return EFBIG.
1959          * Linus frestrict idea will clean these up nicely..
1960          */
1961         if (likely(!isblk)) {
1962                 if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
1963                         if (*count || *pos > inode->i_sb->s_maxbytes) {
1964                                 return -EFBIG;
1965                         }
1966                         /* zero-length writes at ->s_maxbytes are OK */
1967                 }
1968 
1969                 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
1970                         *count = inode->i_sb->s_maxbytes - *pos;
1971         } else {
1972 #ifdef CONFIG_BLOCK
1973                 loff_t isize;
1974                 if (bdev_read_only(I_BDEV(inode)))
1975                         return -EPERM;
1976                 isize = i_size_read(inode);
                     /* As above: only a zero-length write exactly at the end passes. */
1977                 if (*pos >= isize) {
1978                         if (*count || *pos > isize)
1979                                 return -ENOSPC;
1980                 }
1981 
1982                 if (*pos + *count > isize)
1983                         *count = isize - *pos;
1984 #else
1985                 return -EPERM;
1986 #endif
1987         }
1988         return 0;
1989 }
1990 EXPORT_SYMBOL(generic_write_checks);
1991 
1992 int pagecache_write_begin(struct file *file, struct address_space *mapping,
1993                                 loff_t pos, unsigned len, unsigned flags,
1994                                 struct page **pagep, void **fsdata)
1995 {
1996         const struct address_space_operations *aops = mapping->a_ops;
1997 
1998         if (aops->write_begin) {
1999                 return aops->write_begin(file, mapping, pos, len, flags,
2000                                                         pagep, fsdata);
2001         } else {
2002                 int ret;
2003                 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2004                 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
2005                 struct inode *inode = mapping->host;
2006                 struct page *page;
2007 again:
2008                 page = __grab_cache_page(mapping, index);
2009                 *pagep = page;
2010                 if (!page)
2011                         return -ENOMEM;
2012 
2013                 if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
2014                         /*
2015                          * There is no way to resolve a short write situation
2016                          * for a !Uptodate page (except by double copying in
2017                          * the caller done by generic_perform_write_2copy).
2018                          *
2019                          * Instead, we have to bring it uptodate here.
2020                          */
2021                         ret = aops->readpage(file, page);
2022                         page_cache_release(page);
2023                         if (ret) {
2024                                 if (ret == AOP_TRUNCATED_PAGE)
2025                                         goto again;
2026                                 return ret;
2027                         }
2028                         goto again;
2029                 }
2030 
2031                 ret = aops->prepare_write(file, page, offset, offset+len);
2032                 if (ret) {
2033                         unlock_page(page);
2034                         page_cache_release(page);
2035                         if (pos + len > inode->i_size)
2036                                 vmtruncate(inode, inode->i_size);
2037                 }
2038                 return ret;
2039         }
2040 }
2041 EXPORT_SYMBOL(pagecache_write_begin);
2042 
2043 int pagecache_write_end(struct file *file, struct address_space *mapping,
2044                                 loff_t pos, unsigned len, unsigned copied,
2045                                 struct page *page, void *fsdata)
2046 {
2047         const struct address_space_operations *aops = mapping->a_ops;
2048         int ret;
2049 
2050         if (aops->write_end) {
2051                 mark_page_accessed(page);
2052                 ret = aops->write_end(file, mapping, pos, len, copied,
2053                                                         page, fsdata);
2054         } else {
2055                 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
2056                 struct inode *inode = mapping->host;
2057 
2058                 flush_dcache_page(page);
2059                 ret = aops->commit_write(file, page, offset, offset+len);
2060                 unlock_page(page);
2061                 mark_page_accessed(page);
2062                 page_cache_release(page);
2063 
2064                 if (ret < 0) {
2065                         if (pos + len > inode->i_size)
2066                                 vmtruncate(inode, inode->i_size);
2067                 } else if (ret > 0)
2068                         ret = min_t(size_t, copied, ret);
2069                 else
2070                         ret = copied;
2071         }
2072 
2073         return ret;
2074 }
2075 EXPORT_SYMBOL(pagecache_write_end);
2076 
2077 ssize_t
2078 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2079                 unsigned long *nr_segs, loff_t pos, loff_t *ppos,
2080                 size_t count, size_t ocount)
2081 {
2082         struct file     *file = iocb->ki_filp;
2083         struct address_space *mapping = file->f_mapping;
2084         struct inode    *inode = mapping->host;
2085         ssize_t         written;
2086 
2087         if (count != ocount)
2088                 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2089 
2090         written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2091         if (written > 0) {
2092                 loff_t end = pos + written;
2093                 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2094                         i_size_write(inode,  end);
2095                         mark_inode_dirty(inode);
2096                 }
2097                 *ppos = end;
2098         }
2099 
2100         /*
2101          * Sync the fs metadata but not the minor inode changes and
2102          * of course not the data as we did direct DMA for the IO.
2103          * i_mutex is held, which protects generic_osync_inode() from
2104          * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
2105          */
2106         if ((written >= 0 || written == -EIOCBQUEUED) &&
2107             ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2108                 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
2109                 if (err < 0)
2110                         written = err;
2111         }
2112         return written;
2113 }
2114 EXPORT_SYMBOL(generic_file_direct_write);
2115 
2116 /*
2117  * Find or create a page at the given pagecache position. Return the locked
2118  * page. This function is specifically for buffered writes.
2119  */
2120 struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
2121 {
2122         int status;
2123         struct page *page;
2124 repeat:
2125         page = find_lock_page(mapping, index);
2126         if (likely(page))
2127                 return page;
2128 
2129         page = page_cache_alloc(mapping);
2130         if (!page)
2131                 return NULL;
2132         status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
2133         if (unlikely(status)) {
2134                 page_cache_release(page);
2135                 if (status == -EEXIST)
2136                         goto repeat;
2137                 return NULL;
2138         }
2139         return page;
2140 }
2141 EXPORT_SYMBOL(__grab_cache_page);
2142 
/*
 * generic_perform_write_2copy - buffered write loop built on the
 * ->prepare_write / ->commit_write address_space operations.
 *
 * @file: file being written
 * @i:    iterator over the source iovec; advanced as data is committed
 * @pos:  file position at which to start writing
 *
 * Writes one pagecache page per iteration.  When the destination page is
 * not uptodate (and the source is not KERNEL_DS), the data is first copied
 * into a temporary page with the destination unlocked and then memcpy'd
 * into place -- hence "2copy".  Otherwise an atomic usercopy straight into
 * the locked page is used.
 *
 * Returns the number of bytes written if any were, otherwise the last
 * status (0 or a negative error).
 */
2143 static ssize_t generic_perform_write_2copy(struct file *file,
2144                                 struct iov_iter *i, loff_t pos)
2145 {
2146         struct address_space *mapping = file->f_mapping;
2147         const struct address_space_operations *a_ops = mapping->a_ops;
2148         struct inode *inode = mapping->host;
2149         long status = 0;
2150         ssize_t written = 0;
2151 
2152         do {
2153                 struct page *src_page;
2154                 struct page *page;
2155                 pgoff_t index;          /* Pagecache index for current page */
2156                 unsigned long offset;   /* Offset into pagecache page */
2157                 unsigned long bytes;    /* Bytes to write to page */
2158                 size_t copied;          /* Bytes copied from user */
2159 
2160                 offset = (pos & (PAGE_CACHE_SIZE - 1));
2161                 index = pos >> PAGE_CACHE_SHIFT;
2162                 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2163                                                 iov_iter_count(i));
2164 
2165                 /*
2166                  * a non-NULL src_page indicates that we're doing the
2167                  * copy via get_user_pages and kmap.
2168                  */
2169                 src_page = NULL;
2170 
2171                 /*
2172                  * Bring in the user page that we will copy from _first_.
2173                  * Otherwise there's a nasty deadlock on copying from the
2174                  * same page as we're writing to, without it being marked
2175                  * up-to-date.
2176                  *
2177                  * Not only is this an optimisation, but it is also required
2178                  * to check that the address is actually valid, when atomic
2179                  * usercopies are used, below.
2180                  */
2181                 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2182                         status = -EFAULT;
2183                         break;
2184                 }
2185 
                     /* Returns the page locked, or NULL on failure. */
2186                 page = __grab_cache_page(mapping, index);
2187                 if (!page) {
2188                         status = -ENOMEM;
2189                         break;
2190                 }
2191 
2192                 /*
2193                  * non-uptodate pages cannot cope with short copies, and we
2194                  * cannot take a pagefault with the destination page locked.
2195                  * So pin the source page to copy it.
2196                  */
2197                 if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
2198                         unlock_page(page);
2199 
2200                         src_page = alloc_page(GFP_KERNEL);
2201                         if (!src_page) {
2202                                 page_cache_release(page);
2203                                 status = -ENOMEM;
2204                                 break;
2205                         }
2206 
2207                         /*
2208                          * Cannot get_user_pages with a page locked for the
2209                          * same reason as we can't take a page fault with a
2210                          * page locked (as explained below).
2211                          */
2212                         copied = iov_iter_copy_from_user(src_page, i,
2213                                                                 offset, bytes);
2214                         if (unlikely(copied == 0)) {
2215                                 status = -EFAULT;
2216                                 page_cache_release(page);
2217                                 page_cache_release(src_page);
2218                                 break;
2219                         }
                             /* From here on only the bytes actually staged are written. */
2220                         bytes = copied;
2221 
2222                         lock_page(page);
2223                         /*
2224                          * Can't handle the page going uptodate here, because
2225                          * that means we would use non-atomic usercopies, which
2226                          * zero out the tail of the page, which can cause
2227                          * zeroes to become transiently visible. We could just
2228                          * use a non-zeroing copy, but the APIs aren't too
2229                          * consistent.
2230                          */
2231                         if (unlikely(!page->mapping || PageUptodate(page))) {
2232                                 unlock_page(page);
2233                                 page_cache_release(page);
2234                                 page_cache_release(src_page);
                                     /* pos and the iterator are unchanged: retry this position. */
2235                                 continue;
2236                         }
2237                 }
2238 
2239                 status = a_ops->prepare_write(file, page, offset, offset+bytes);
2240                 if (unlikely(status))
2241                         goto fs_write_aop_error;
2242 
2243                 if (!src_page) {
2244                         /*
2245                          * Must not enter the pagefault handler here, because
2246                          * we hold the page lock, so we might recursively
2247                          * deadlock on the same lock, or get an ABBA deadlock
2248                          * against a different lock, or against the mmap_sem
2249                          * (which nests outside the page lock).  So increment
2250                          * preempt count, and use _atomic usercopies.
2251                          *
2252                          * The page is uptodate so we are OK to encounter a
2253                          * short copy: if unmodified parts of the page are
2254                          * marked dirty and written out to disk, it doesn't
2255                          * really matter.
2256                          */
2257                         pagefault_disable();
2258                         copied = iov_iter_copy_from_user_atomic(page, i,
2259                                                                 offset, bytes);
2260                         pagefault_enable();
2261                 } else {
2262                         void *src, *dst;
2263                         src = kmap_atomic(src_page, KM_USER0);
2264                         dst = kmap_atomic(page, KM_USER1);
                             /* src_page was filled at the same in-page offset above. */
2265                         memcpy(dst + offset, src + offset, bytes);
2266                         kunmap_atomic(dst, KM_USER1);
2267                         kunmap_atomic(src, KM_USER0);
2268                         copied = bytes;
2269                 }
2270                 flush_dcache_page(page);
2271 
2272                 status = a_ops->commit_write(file, page, offset, offset+bytes);
2273                 if (unlikely(status < 0))
2274                         goto fs_write_aop_error;
2275                 if (unlikely(status > 0)) /* filesystem did partial write */
2276                         copied = min_t(size_t, copied, status);
2277 
2278                 unlock_page(page);
2279                 mark_page_accessed(page);
2280                 page_cache_release(page);
2281                 if (src_page)
2282                         page_cache_release(src_page);
2283 
2284                 iov_iter_advance(i, copied);
2285                 pos += copied;
2286                 written += copied;
2287 
2288                 balance_dirty_pages_ratelimited(mapping);
2289                 cond_resched();
2290                 continue;
2291 
     /* Reached only via goto: prepare_write or commit_write failed with the page locked. */
2292 fs_write_aop_error:
2293                 unlock_page(page);
2294                 page_cache_release(page);
2295                 if (src_page)
2296                         page_cache_release(src_page);
2297 
2298                 /*
2299                  * prepare_write() may have instantiated a few blocks
2300                  * outside i_size.  Trim these off again. Don't need
2301                  * i_size_read because we hold i_mutex.
2302                  */
2303                 if (pos + bytes > inode->i_size)
2304                         vmtruncate(inode, inode->i_size);
2305                 break;
2306         } while (iov_iter_count(i));
2307 
             /* Partial progress takes precedence over the error code. */
2308         return written ? written : status;
2309 }
2310 
/*
 * generic_perform_write - buffered write loop built on the ->write_begin /
 * ->write_end address_space operations.
 *
 * @file: file being written
 * @i:    iterator over the source iovec; advanced as data is accepted
 * @pos:  file position at which to start writing
 *
 * Writes at most one pagecache page per iteration, copying the user data
 * with pagefaults disabled between ->write_begin and ->write_end.
 *
 * Returns the number of bytes written if any were, otherwise the last
 * status (0 or a negative error).
 */
2311 static ssize_t generic_perform_write(struct file *file,
2312                                 struct iov_iter *i, loff_t pos)
2313 {
2314         struct address_space *mapping = file->f_mapping;
2315         const struct address_space_operations *a_ops = mapping->a_ops;
2316         long status = 0;
2317         ssize_t written = 0;
2318         unsigned int flags = 0;
2319 
2320         /*
2321          * Copies from kernel address space cannot fail (NFSD is a big user).
2322          */
2323         if (segment_eq(get_fs(), KERNEL_DS))
2324                 flags |= AOP_FLAG_UNINTERRUPTIBLE;
2325 
2326         do {
2327                 struct page *page;
2328                 pgoff_t index;          /* Pagecache index for current page */
2329                 unsigned long offset;   /* Offset into pagecache page */
2330                 unsigned long bytes;    /* Bytes to write to page */
2331                 size_t copied;          /* Bytes copied from user */
2332                 void *fsdata;
2333 
2334                 offset = (pos & (PAGE_CACHE_SIZE - 1));
2335                 index = pos >> PAGE_CACHE_SHIFT;
2336                 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2337                                                 iov_iter_count(i));
2338 
     /* Re-entered with a smaller 'bytes' when a pass copied nothing (see below). */
2339 again:
2340 
2341                 /*
2342                  * Bring in the user page that we will copy from _first_.
2343                  * Otherwise there's a nasty deadlock on copying from the
2344                  * same page as we're writing to, without it being marked
2345                  * up-to-date.
2346                  *
2347                  * Not only is this an optimisation, but it is also required
2348                  * to check that the address is actually valid, when atomic
2349                  * usercopies are used, below.
2350                  */
2351                 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2352                         status = -EFAULT;
2353                         break;
2354                 }
2355 
2356                 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2357                                                 &page, &fsdata);
2358                 if (unlikely(status))
2359                         break;
2360 
                     /* The atomic copy below must run with pagefaults disabled. */
2361                 pagefault_disable();
2362                 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2363                 pagefault_enable();
2364                 flush_dcache_page(page);
2365 
2366                 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2367                                                 page, fsdata);
2368                 if (unlikely(status < 0))
2369                         break;
                     /* The non-negative write_end result replaces the copied count. */
2370                 copied = status;
2371 
2372                 cond_resched();
2373 
2374                 iov_iter_advance(i, copied);
2375                 if (unlikely(copied == 0)) {
2376                         /*
2377                          * If we were unable to copy any data at all, we must
2378                          * fall back to a single segment length write.
2379                          *
2380                          * If we didn't fallback here, we could livelock
2381                          * because not all segments in the iov can be copied at
2382                          * once without a pagefault.
2383                          */
2384                         bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2385                                                 iov_iter_single_seg_count(i));
2386                         goto again;
2387                 }
2388                 pos += copied;
2389                 written += copied;
2390 
2391                 balance_dirty_pages_ratelimited(mapping);
2392 
2393         } while (iov_iter_count(i));
2394 
             /* Partial progress takes precedence over the error code. */
2395         return written ? written : status;
2396 }
2397 
2398 ssize_t
2399 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2400                 unsigned long nr_segs, loff_t pos, loff_t *ppos,
2401                 size_t count, ssize_t written)
2402 {
2403         struct file *file = iocb->ki_filp;
2404         struct address_space *mapping = file->f_mapping;
2405         const struct address_space_operations *a_ops = mapping->a_ops;
2406         struct inode *inode = mapping->host;
2407         ssize_t status;
2408         struct iov_iter i;
2409 
2410         iov_iter_init(&i, iov, nr_segs, count, written);
2411         if (a_ops->write_begin)
2412                 status = generic_perform_write(file, &i, pos);
2413         else
2414                 status = generic_perform_write_2copy(file, &i, pos);
2415 
2416         if (likely(status >= 0)) {
2417                 written += status;
2418                 *ppos = pos + status;
2419 
2420                 /*
2421                  * For now, when the user asks for O_SYNC, we'll actually give
2422                  * O_DSYNC
2423                  */
2424                 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2425                         if (!a_ops->writepage || !is_sync_kiocb(iocb))
2426                                 status = generic_osync_inode(inode, mapping,
2427                                                 OSYNC_METADATA|OSYNC_DATA);
2428                 }
2429         }
2430         
2431         /*
2432          * If we get here for O_DIRECT writes then we must have fallen through
2433          * to buffered writes (block instantiation inside i_size).  So we sync
2434          * the file data here, to try to honour O_DIRECT expectations.
2435          */
2436         if (unlikely(file->f_flags & O_DIRECT) && written)
2437                 status = filemap_write_and_wait(mapping);
2438 
2439         return written ? written : status;
2440 }
2441 EXPORT_SYMBOL(generic_file_buffered_write);
2442 
     /*
      * Common body of generic_file_aio_write() and
      * generic_file_aio_write_nolock().  This function takes no lock itself:
      * generic_file_aio_write() calls it with inode->i_mutex held, while
      * generic_file_aio_write_nolock() calls it without taking the mutex.
      *
      * @iocb:    kiocb describing the request; the target file is iocb->ki_filp
      * @iov:     user segments to write from
      * @nr_segs: number of entries in @iov (may be adjusted by
      *           generic_segment_checks(), which receives its address)
      * @ppos:    file position; read on entry and passed on to the
      *           direct/buffered write helpers
      *
      * Steps: validate the segments, wait out a filesystem freeze, apply
      * generic_write_checks(), strip suid via remove_suid(), update the file
      * times, then write either through the O_DIRECT path (with a buffered
      * fallback for whatever the direct write did not cover) or through the
      * plain buffered path.
      *
      * Returns the number of bytes written if that is non-zero, otherwise the
      * last recorded error (which is 0 when there was simply nothing to do).
      */
2443 static ssize_t
2444 __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2445                                 unsigned long nr_segs, loff_t *ppos)
2446 {
2447         struct file *file = iocb->ki_filp;
2448         struct address_space * mapping = file->f_mapping;
2449         size_t ocount;          /* original count */
2450         size_t count;           /* after file limit checks */
2451         struct inode    *inode = mapping->host;
2452         loff_t          pos;
2453         ssize_t         written;
2454         ssize_t         err;
2455 
2456         ocount = 0;
2457         err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
2458         if (err)
2459                 return err;
2460 
2461         count = ocount;
2462         pos = *ppos;
2463 
2464         vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2465 
2466         /* We can write back this queue in page reclaim */
2467         current->backing_dev_info = mapping->backing_dev_info;
2468         written = 0;
2469 
             /*
              * From here on every exit goes through "out" so that
              * current->backing_dev_info is reset.
              */
2470         err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2471         if (err)
2472                 goto out;
2473 
             /* Nothing left to write after the checks: return err, i.e. 0. */
2474         if (count == 0)
2475                 goto out;
2476 
2477         err = remove_suid(file->f_path.dentry);
2478         if (err)
2479                 goto out;
2480 
2481         file_update_time(file);
2482 
2483         /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2484         if (unlikely(file->f_flags & O_DIRECT)) {
2485                 loff_t endbyte;
2486                 ssize_t written_buffered;
2487 
2488                 written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
2489                                                         ppos, count, ocount);
                     /*
                      * Done if the direct write failed outright or covered the
                      * whole request; only a short write continues below.
                      */
2490                 if (written < 0 || written == count)
2491                         goto out;
2492                 /*
2493                  * direct-io write to a hole: fall through to buffered I/O
2494                  * for completing the rest of the request.
2495                  */
2496                 pos += written;
2497                 count -= written;
2498                 written_buffered = generic_file_buffered_write(iocb, iov,
2499                                                 nr_segs, pos, ppos, count,
2500                                                 written);
2501                 /*
2502                  * If generic_file_buffered_write() returned a synchronous error
2503                  * then we want to return the number of bytes which were
2504                  * direct-written, or the error code if that was zero.  Note
2505                  * that this differs from normal direct-io semantics, which
2506                  * will return -EFOO even if some bytes were written.
2507                  */
2508                 if (written_buffered < 0) {
2509                         err = written_buffered;
2510                         goto out;
2511                 }
2512 
2513                 /*
2514                  * We need to ensure that the page cache pages are written to
2515                  * disk and invalidated to preserve the expected O_DIRECT
2516                  * semantics.
                      *
                      * written_buffered is the running total (the direct-written
                      * count was passed in as the starting value), so subtracting
                      * "written" leaves the buffered part, which starts at pos.
2517                  */
2518                 endbyte = pos + written_buffered - written - 1;
2519                 err = do_sync_mapping_range(file->f_mapping, pos, endbyte,
2520                                             SYNC_FILE_RANGE_WAIT_BEFORE|
2521                                             SYNC_FILE_RANGE_WRITE|
2522                                             SYNC_FILE_RANGE_WAIT_AFTER);
2523                 if (err == 0) {
2524                         written = written_buffered;
2525                         invalidate_mapping_pages(mapping,
2526                                                  pos >> PAGE_CACHE_SHIFT,
2527                                                  endbyte >> PAGE_CACHE_SHIFT);
2528                 } else {
2529                         /*
2530                          * We don't know how much we wrote, so just return
2531                          * the number of bytes which were direct-written
                              * ("written" is left untouched; err is reported only
                              * if that count is zero).
2532                          */
2533                 }
2534         } else {
2535                 written = generic_file_buffered_write(iocb, iov, nr_segs,
2536                                 pos, ppos, count, written);
2537         }
2538 out:
             /* Undo the backing_dev_info assignment made before the checks. */
2539         current->backing_dev_info = NULL;
2540         return written ? written : err;
2541 }
2542 
2543 ssize_t generic_file_aio_write_nolock(struct kiocb *iocb,
2544                 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
2545 {
2546         struct file *file = iocb->ki_filp;
2547         struct address_space *mapping = file->f_mapping;
2548         struct inode *inode = mapping->host;
2549         ssize_t ret;
2550 
2551         BUG_ON(iocb->ki_pos != pos);
2552 
2553         ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2554                         &iocb->ki_pos);
2555 
2556         if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2557                 ssize_t err;
2558 
2559                 err = sync_page_range_nolock(inode, mapping, pos, ret);
2560                 if (err < 0)
2561                         ret = err;
2562         }
2563         return ret;
2564 }
2565 EXPORT_SYMBOL(generic_file_aio_write_nolock);
2566 
2567 ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2568                 unsigned long nr_segs, loff_t pos)
2569 {
2570         struct file *file = iocb->ki_filp;
2571         struct address_space *mapping = file->f_mapping;
2572         struct inode *inode = mapping->host;
2573         ssize_t ret;
2574 
2575         BUG_ON(iocb->ki_pos != pos);
2576 
2577         mutex_lock(&inode->i_mutex);
2578         ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs,
2579                         &iocb->ki_pos);
2580         mutex_unlock(&inode->i_mutex);
2581 
2582         if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2583                 ssize_t err;
2584 
2585                 err = sync_page_range(inode, mapping, pos, ret);
2586                 if (err < 0)
2587                         ret = err;
2588         }
2589         return ret;
2590 }
2591 EXPORT_SYMBOL(generic_file_aio_write);
2592 
2593 /*
2594  * Called under i_mutex for writes to S_ISREG files.   Returns -EIO if something
2595  * went wrong during pagecache shootdown.
      *
      * Wrapper around the mapping's ->direct_IO() method that first brings the
      * page cache into a state where direct I/O is coherent with it.
      *
      * @rw:      transfer direction; only rw == WRITE triggers the unmap and
      *           invalidation steps
      * @iocb:    kiocb for the request; the file is iocb->ki_filp
      * @iov:     segments to transfer
      * @offset:  byte offset in the file where the transfer starts
      * @nr_segs: number of entries in @iov
      *
      * The mapping is always flushed with filemap_write_and_wait() first.  For
      * writes, the page-cache pages from index offset >> PAGE_CACHE_SHIFT up to
      * the page holding the last written byte are invalidated both before and
      * after ->direct_IO().
      *
      * If the flush or the first invalidation fails, that error is returned and
      * ->direct_IO() is never called.  Otherwise the return value is whatever
      * ->direct_IO() returned; the outcome of the second invalidation is ignored.
2596  */
2597 static ssize_t
2598 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2599         loff_t offset, unsigned long nr_segs)
2600 {
2601         struct file *file = iocb->ki_filp;
2602         struct address_space *mapping = file->f_mapping;
2603         ssize_t retval;
2604         size_t write_len;
             /* end is only assigned and only read on the rw == WRITE paths. */
2605         pgoff_t end = 0; /* silence gcc */
2606 
2607         /*
2608          * If it's a write, unmap all mmappings of the file up-front.  This
2609          * will cause any pte dirty bits to be propagated into the pageframes
2610          * for the subsequent filemap_write_and_wait().
2611          */
2612         if (rw == WRITE) {
2613                 write_len = iov_length(iov, nr_segs);
                     /* Index of the page containing the last byte to be written. */
2614                 end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
2615                 if (mapping_mapped(mapping))
2616                         unmap_mapping_range(mapping, offset, write_len, 0);
2617         }
2618 
2619         retval = filemap_write_and_wait(mapping);
2620         if (retval)
2621                 goto out;
2622 
2623         /*
2624          * After a write we want buffered reads to be sure to go to disk to get
2625          * the new data.  We invalidate clean cached page from the region we're
2626          * about to write.  We do this *before* the write so that we can return
2627          * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2628          */
2629         if (rw == WRITE && mapping_nrpages(mapping)) {
2630                 retval = invalidate_inode_pages2_range(mapping,
2631                                         offset >> PAGE_CACHE_SHIFT, end);
2632                 if (retval)
2633                         goto out;
2634         }
2635 
2636         retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
2637 
2638         /*
2639          * Finally, try again to invalidate clean pages which might have been
2640          * cached by non-direct readahead, or faulted in by get_user_pages()
2641          * if the source of the write was an mmap'ed region of the file
2642          * we're writing.  Either one is a pretty crazy thing to do,
2643          * so we don't support it 100%.  If this invalidation
2644          * fails, tough, the write still worked...
2645          */
2646         if (rw == WRITE && mapping_nrpages(mapping)) {
                     /* Return value deliberately dropped: retval keeps ->direct_IO()'s result. */
2647                 invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
2648         }
2649 out:
2650         return retval;
2651 }
2652 
2653 /**
2654  * try_to_release_page() - release old fs-specific metadata on a page
2655  *
2656  * @page: the page which the kernel is trying to free
2657  * @gfp_mask: memory allocation flags (and I/O mode)
2658  *
2659  * The address_space is to try to release any data against the page
2660  * (presumably at page->private).  If the release was successful, return `1'.
2661  * Otherwise return zero.
2662  *
2663  * The @gfp_mask argument specifies whether I/O may be performed to release
2664  * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
2665  *
2666  * NOTE: @gfp_mask may go away, and this function may become non-blocking.
2667  */
2668 int try_to_release_page(struct page *page, gfp_t gfp_mask)
2669 {
2670         struct address_space * const mapping = page->mapping;
2671 
2672         BUG_ON(!PageLocked(page));
2673         if (PageWriteback(page))
2674                 return 0;
2675 
2676         if (mapping && mapping->a_ops->releasepage)
2677                 return mapping->a_ops->releasepage(page, gfp_mask);
2678         return try_to_free_buffers(page);
2679 }
2680 
2681 EXPORT_SYMBOL(try_to_release_page);
2682 
  This page was automatically generated by the LXR engine.