1 /*
2  * mm/mmap.c
3  *
4  * Written by obz.
5  *
6  * Address space accounting code        <alan@redhat.com>
7  */
8
9 #include <linux/slab.h>
10 #include <linux/shm.h>
11 #include <linux/mman.h>
12 #include <linux/pagemap.h>
13 #include <linux/swap.h>
14 #include <linux/init.h>
15 #include <linux/file.h>
16 #include <linux/fs.h>
17 #include <linux/personality.h>
18 #include <linux/security.h>
19 #include <linux/hugetlb.h>
20 #include <linux/profile.h>
21
22 #include <asm/uaccess.h>
23 #include <asm/pgalloc.h>
24 #include <asm/tlb.h>
25
26 /*
27  * WARNING: the debugging will use recursive algorithms so never enable this
28  * unless you know what you are doing.
29  */
30 #undef DEBUG_MM_RB
31
32 /* description of effects of mapping type and prot in current implementation.
33  * this is due to the limited x86 page protection hardware.  The expected
34  * behavior is in parens:
35  *
36  * map_type     prot
37  *              PROT_NONE       PROT_READ       PROT_WRITE      PROT_EXEC
38  * MAP_SHARED   r: (no) no      r: (yes) yes    r: (no) yes     r: (no) yes
39  *              w: (no) no      w: (no) no      w: (yes) yes    w: (no) no
40  *              x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
41  *              
42  * MAP_PRIVATE  r: (no) no      r: (yes) yes    r: (no) yes     r: (no) yes
43  *              w: (no) no      w: (no) no      w: (copy) copy  w: (no) no
44  *              x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
45  *
46  */
47 pgprot_t protection_map[16] = {
48         __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
49         __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
50 };
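
The table above is indexed by the low four VM_* permission bits, which is why do_mmap_pgoff() below computes protection_map[vm_flags & 0x0f]. A minimal userspace sketch of how that index is formed, assuming the classic VM_READ/VM_WRITE/VM_EXEC/VM_SHARED bit values (0x1, 0x2, 0x4, 0x8); the constants here are restated for illustration, not taken from a kernel header:

/* Hypothetical sketch (not kernel code): shows how the 4-bit index into
 * protection_map[] is formed from the low VM_* permission bits. */
#include <stdio.h>

#define VM_READ   0x0001UL
#define VM_WRITE  0x0002UL
#define VM_EXEC   0x0004UL
#define VM_SHARED 0x0008UL

int main(void)
{
        unsigned long vm_flags = VM_READ | VM_WRITE | VM_SHARED;

        /* Index 0..7 selects __P000..__P111 (private, COW on write),
         * index 8..15 selects __S000..__S111 (shared). */
        printf("protection_map index = %lu\n", vm_flags & 0x0f); /* prints 11 */
        return 0;
}
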
51
52 int sysctl_overcommit_memory = 0;       /* default is heuristic overcommit */
53 int sysctl_overcommit_ratio = 50;       /* default is 50% */
54 atomic_t vm_committed_space = ATOMIC_INIT(0);
55
56 /*
57  * Check that a process has enough memory to allocate a new virtual
58  * mapping. 1 means there is enough memory for the allocation to
59  * succeed and 0 implies there is not.
60  *
61  * We currently support three overcommit policies, which are set via the
62  * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-accounting
63  *
64  * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
65  * Additional code 2002 Jul 20 by Robert Love.
66  */
67 int vm_enough_memory(long pages)
68 {
69         unsigned long free, allowed;
70
71         vm_acct_memory(pages);
72
73         /*
74          * Sometimes we want to use more memory than we have
75          */
76         if (sysctl_overcommit_memory == 1)
77                 return 1;
78
79         if (sysctl_overcommit_memory == 0) {
80                 free = get_page_cache_size();
81                 free += nr_free_pages();
82                 free += nr_swap_pages;
83
84                 /*
85                  * The code below doesn't account for free space in the
86                  * inode and dentry slab cache, slab cache fragmentation,
87                  * inodes and dentries which will become freeable under
88                  * VM load, etc. Lets just hope all these (complex)
89                  * factors balance out...
90                  */
91                 free += (dentry_stat.nr_unused * sizeof(struct dentry)) >>
92                         PAGE_SHIFT;
93                 free += (inodes_stat.nr_unused * sizeof(struct inode)) >>
94                         PAGE_SHIFT;
95
96                 if (free > pages)
97                         return 1;
98                 vm_unacct_memory(pages);
99                 return 0;
100         }
101
102         allowed = totalram_pages * sysctl_overcommit_ratio / 100;
103         allowed += total_swap_pages;
104
105         if (atomic_read(&vm_committed_space) < allowed)
106                 return 1;
107
108         vm_unacct_memory(pages);
109
110         return 0;
111 }
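
In other words: overcommit_memory == 1 always succeeds, == 0 applies the heuristic free-memory estimate, and any other value falls through to the strict check against swap plus RAM scaled by overcommit_ratio. A small standalone sketch of that last comparison, with made-up numbers standing in for the kernel counters:

/* Hypothetical userspace sketch of the strict-overcommit check:
 * allowed = ram * overcommit_ratio / 100 + swap, compared against the
 * space already committed.  All numbers are made up for illustration. */
#include <stdio.h>

int main(void)
{
        unsigned long totalram_pages   = 262144;  /* 1 GB of 4 KB pages      */
        unsigned long total_swap_pages = 131072;  /* 512 MB of swap          */
        int sysctl_overcommit_ratio    = 50;      /* the default             */
        unsigned long committed        = 300000;  /* pages already committed */
        unsigned long request          = 100000;  /* new request, in pages   */

        unsigned long allowed = totalram_pages * sysctl_overcommit_ratio / 100
                                + total_swap_pages;

        /* Mirrors the vm_committed_space < allowed test above. */
        printf("allowed=%lu committed(after)=%lu -> %s\n",
               allowed, committed + request,
               committed + request < allowed ? "granted" : "denied");
        return 0;
}
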
112
113 /*
114  * Requires inode->i_mapping->i_shared_sem
115  */
116 static inline void
117 __remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode)
118 {
119         if (inode) {
120                 if (vma->vm_flags & VM_DENYWRITE)
121                         atomic_inc(&inode->i_writecount);
122                 list_del_init(&vma->shared);
123         }
124 }
125
126 /*
127  * Remove one vm structure from the inode's i_mapping address space.
128  */
129 static void remove_shared_vm_struct(struct vm_area_struct *vma)
130 {
131         struct file *file = vma->vm_file;
132
133         if (file) {
134                 struct inode *inode = file->f_dentry->d_inode;
135
136                 down(&inode->i_mapping->i_shared_sem);
137                 __remove_shared_vm_struct(vma, inode);
138                 up(&inode->i_mapping->i_shared_sem);
139         }
140 }
141
142 /*
143  *  sys_brk() for the most part doesn't need the global kernel
144  *  lock, except when an application is doing something nasty
145  *  like trying to un-brk an area that has already been mapped
146  *  to a regular file.  in this case, the unmapping will need
147  *  to invoke file system routines that need the global lock.
148  */
149 asmlinkage unsigned long sys_brk(unsigned long brk)
150 {
151         unsigned long rlim, retval;
152         unsigned long newbrk, oldbrk;
153         struct mm_struct *mm = current->mm;
154
155         down_write(&mm->mmap_sem);
156
157         if (brk < mm->end_code)
158                 goto out;
159         newbrk = PAGE_ALIGN(brk);
160         oldbrk = PAGE_ALIGN(mm->brk);
161         if (oldbrk == newbrk)
162                 goto set_brk;
163
164         /* Always allow shrinking brk. */
165         if (brk <= mm->brk) {
166                 if (!do_munmap(mm, newbrk, oldbrk-newbrk))
167                         goto set_brk;
168                 goto out;
169         }
170
171         /* Check against rlimit.. */
172         rlim = current->rlim[RLIMIT_DATA].rlim_cur;
173         if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
174                 goto out;
175
176         /* Check against existing mmap mappings. */
177         if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
178                 goto out;
179
180         /* Ok, looks good - let it rip. */
181         if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
182                 goto out;
183 set_brk:
184         mm->brk = brk;
185 out:
186         retval = mm->brk;
187         up_write(&mm->mmap_sem);
188         return retval;
189 }
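
From userspace this path is reached through brk(2)/sbrk(3): growing the break can fail against RLIMIT_DATA or an intervening mapping, while shrinking is always allowed, as the code above notes. A minimal usage sketch (error handling illustrative only):

/* Minimal userspace sketch of growing and shrinking the program break,
 * which ends up in sys_brk() above. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        void *old = sbrk(0);            /* current break                */
        if (sbrk(4096) == (void *)-1) { /* try to grow by one page      */
                perror("sbrk");
                return 1;
        }
        printf("break moved from %p to %p\n", old, sbrk(0));
        brk(old);                       /* shrinking is always allowed  */
        return 0;
}
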
190
191 /* Combine the mmap "prot" and "flags" argument into one "vm_flags" used
192  * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
193  * into "VM_xxx".
194  */
195 static inline unsigned long
196 calc_vm_flags(unsigned long prot, unsigned long flags)
197 {
198 #define _trans(x,bit1,bit2) \
199 ((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)
200
201         unsigned long prot_bits, flag_bits;
202         prot_bits =
203                 _trans(prot, PROT_READ, VM_READ) |
204                 _trans(prot, PROT_WRITE, VM_WRITE) |
205                 _trans(prot, PROT_EXEC, VM_EXEC);
206         flag_bits =
207                 _trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) |
208                 _trans(flags, MAP_DENYWRITE, VM_DENYWRITE) |
209                 _trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE);
210         return prot_bits | flag_bits;
211 #undef _trans
212 }
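
The _trans() macro takes a cheap mask when the source and destination bits happen to coincide, and otherwise tests the source bit and substitutes the destination bit. A hypothetical standalone restatement of that idiom, using made-up bit values rather than the real PROT_*/MAP_* constants:

/* Hypothetical sketch of the _trans() idiom: translate bit "from" in x
 * into bit "to", taking the mask shortcut when both bits coincide. */
#include <stdio.h>

static unsigned long trans(unsigned long x, unsigned long from, unsigned long to)
{
        return (from == to) ? (x & from) : ((x & from) ? to : 0);
}

int main(void)
{
        /* Made-up bit values, just to show both paths. */
        printf("%lx\n", trans(0x1, 0x1, 0x1));   /* same bit  -> 0x1  */
        printf("%lx\n", trans(0x2, 0x2, 0x80));  /* translate -> 0x80 */
        printf("%lx\n", trans(0x0, 0x2, 0x80));  /* absent    -> 0x0  */
        return 0;
}
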
213
214 #ifdef DEBUG_MM_RB
215 static int browse_rb(struct rb_node * rb_node) {
216         int i = 0;
217         if (rb_node) {
218                 i++;
219                 i += browse_rb(rb_node->rb_left);
220                 i += browse_rb(rb_node->rb_right);
221         }
222         return i;
223 }
224
225 static void validate_mm(struct mm_struct * mm) {
226         int bug = 0;
227         int i = 0;
228         struct vm_area_struct * tmp = mm->mmap;
229         while (tmp) {
230                 tmp = tmp->vm_next;
231                 i++;
232         }
233         if (i != mm->map_count)
234                 printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
235         i = browse_rb(mm->mm_rb.rb_node);
236         if (i != mm->map_count)
237                 printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
238         if (bug)
239                 BUG();
240 }
241 #else
242 #define validate_mm(mm) do { } while (0)
243 #endif
244
245 static struct vm_area_struct *
246 find_vma_prepare(struct mm_struct *mm, unsigned long addr,
247                 struct vm_area_struct **pprev, struct rb_node ***rb_link,
248                 struct rb_node ** rb_parent)
249 {
250         struct vm_area_struct * vma;
251         struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
252
253         __rb_link = &mm->mm_rb.rb_node;
254         rb_prev = __rb_parent = NULL;
255         vma = NULL;
256
257         while (*__rb_link) {
258                 struct vm_area_struct *vma_tmp;
259
260                 __rb_parent = *__rb_link;
261                 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
262
263                 if (vma_tmp->vm_end > addr) {
264                         vma = vma_tmp;
265                         if (vma_tmp->vm_start <= addr)
266                                 return vma;
267                         __rb_link = &__rb_parent->rb_left;
268                 } else {
269                         rb_prev = __rb_parent;
270                         __rb_link = &__rb_parent->rb_right;
271                 }
272         }
273
274         *pprev = NULL;
275         if (rb_prev)
276                 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
277         *rb_link = __rb_link;
278         *rb_parent = __rb_parent;
279         return vma;
280 }
281
282 static inline void
283 __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
284                 struct vm_area_struct *prev, struct rb_node *rb_parent)
285 {
286         if (prev) {
287                 vma->vm_next = prev->vm_next;
288                 prev->vm_next = vma;
289         } else {
290                 mm->mmap = vma;
291                 if (rb_parent)
292                         vma->vm_next = rb_entry(rb_parent,
293                                         struct vm_area_struct, vm_rb);
294                 else
295                         vma->vm_next = NULL;
296         }
297 }
298
299 static void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
300                         struct rb_node **rb_link, struct rb_node *rb_parent)
301 {
302         rb_link_node(&vma->vm_rb, rb_parent, rb_link);
303         rb_insert_color(&vma->vm_rb, &mm->mm_rb);
304 }
305
306 static inline void __vma_link_file(struct vm_area_struct *vma)
307 {
308         struct file * file;
309
310         file = vma->vm_file;
311         if (file) {
312                 struct inode * inode = file->f_dentry->d_inode;
313                 struct address_space *mapping = inode->i_mapping;
314
315                 if (vma->vm_flags & VM_DENYWRITE)
316                         atomic_dec(&inode->i_writecount);
317
318                 if (vma->vm_flags & VM_SHARED)
319                         list_add_tail(&vma->shared, &mapping->i_mmap_shared);
320                 else
321                         list_add_tail(&vma->shared, &mapping->i_mmap);
322         }
323 }
324
325 static void
326 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
327         struct vm_area_struct *prev, struct rb_node **rb_link,
328         struct rb_node *rb_parent)
329 {
330         __vma_link_list(mm, vma, prev, rb_parent);
331         __vma_link_rb(mm, vma, rb_link, rb_parent);
332         __vma_link_file(vma);
333 }
334
335 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
336                         struct vm_area_struct *prev, struct rb_node **rb_link,
337                         struct rb_node *rb_parent)
338 {
339         struct address_space *mapping = NULL;
340
341         if (vma->vm_file)
342                 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
343
344         if (mapping)
345                 down(&mapping->i_shared_sem);
346         spin_lock(&mm->page_table_lock);
347         __vma_link(mm, vma, prev, rb_link, rb_parent);
348         spin_unlock(&mm->page_table_lock);
349         if (mapping)
350                 up(&mapping->i_shared_sem);
351
352         mark_mm_hugetlb(mm, vma);
353         mm->map_count++;
354         validate_mm(mm);
355 }
356
357 /*
358  * If the vma has a ->close operation then the driver probably needs to release
359  * per-vma resources, so we don't attempt to merge those.
360  */
361 #define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
362
363 static inline int is_mergeable_vma(struct vm_area_struct *vma,
364                         struct file *file, unsigned long vm_flags)
365 {
366         if (vma->vm_ops && vma->vm_ops->close)
367                 return 0;
368         if (vma->vm_file != file)
369                 return 0;
370         if (vma->vm_flags != vm_flags)
371                 return 0;
372         if (vma->vm_private_data)
373                 return 0;
374         return 1;
375 }
376
377 /*
378  * Return true if we can merge this (vm_flags,file,vm_pgoff,size)
379  * in front of (at a lower virtual address and file offset than) the vma.
380  *
381  * We don't check here for the merged mmap wrapping around the end of pagecache
382  * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
383  * wrap, nor mmaps which cover the final page at index -1UL.
384  */
385 static int
386 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
387         struct file *file, unsigned long vm_pgoff, unsigned long size)
388 {
389         if (is_mergeable_vma(vma, file, vm_flags)) {
390                 if (!file)
391                         return 1;       /* anon mapping */
392                 if (vma->vm_pgoff == vm_pgoff + size)
393                         return 1;
394         }
395         return 0;
396 }
397
398 /*
399  * Return true if we can merge this (vm_flags,file,vm_pgoff)
400  * beyond (at a higher virtual address and file offset than) the vma.
401  */
402 static int
403 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
404         struct file *file, unsigned long vm_pgoff)
405 {
406         if (is_mergeable_vma(vma, file, vm_flags)) {
407                 unsigned long vma_size;
408
409                 if (!file)
410                         return 1;       /* anon mapping */
411
412                 vma_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
413                 if (vma->vm_pgoff + vma_size == vm_pgoff)
414                         return 1;
415         }
416         return 0;
417 }
418
419 /*
420  * Given a new mapping request (addr,end,vm_flags,file,pgoff), figure out
421  * whether that can be merged with its predecessor or its successor.  Or
422  * both (it neatly fills a hole).
423  */
424 static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
425                         struct rb_node *rb_parent, unsigned long addr, 
426                         unsigned long end, unsigned long vm_flags,
427                         struct file *file, unsigned long pgoff)
428 {
429         spinlock_t * lock = &mm->page_table_lock;
430
431         /*
432          * We later require that vma->vm_flags == vm_flags, so this tests
433          * vma->vm_flags & VM_SPECIAL, too.
434          */
435         if (vm_flags & VM_SPECIAL)
436                 return 0;
437
438         if (!prev) {
439                 prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
440                 goto merge_next;
441         }
442
443         /*
444          * Can it merge with the predecessor?
445          */
446         if (prev->vm_end == addr &&
447                         is_mergeable_vma(prev, file, vm_flags) &&
448                         can_vma_merge_after(prev, vm_flags, file, pgoff)) {
449                 struct vm_area_struct *next;
450                 struct inode *inode = file ? file->f_dentry->d_inode : NULL;
451                 int need_up = 0;
452
453                 if (unlikely(file && prev->vm_next &&
454                                 prev->vm_next->vm_file == file)) {
455                         down(&inode->i_mapping->i_shared_sem);
456                         need_up = 1;
457                 }
458                 spin_lock(lock);
459                 prev->vm_end = end;
460
461                 /*
462                  * OK, it did.  Can we now merge in the successor as well?
463                  */
464                 next = prev->vm_next;
465                 if (next && prev->vm_end == next->vm_start &&
466                                 can_vma_merge_before(next, vm_flags, file,
467                                         pgoff, (end - addr) >> PAGE_SHIFT)) {
468                         prev->vm_end = next->vm_end;
469                         __vma_unlink(mm, next, prev);
470                         __remove_shared_vm_struct(next, inode);
471                         spin_unlock(lock);
472                         if (need_up)
473                                 up(&inode->i_mapping->i_shared_sem);
474
475                         mm->map_count--;
476                         kmem_cache_free(vm_area_cachep, next);
477                         return 1;
478                 }
479                 spin_unlock(lock);
480                 if (need_up)
481                         up(&inode->i_mapping->i_shared_sem);
482                 return 1;
483         }
484
485         /*
486          * Can this new request be merged in front of prev->vm_next?
487          */
488         prev = prev->vm_next;
489         if (prev) {
490  merge_next:
491                 if (!can_vma_merge_before(prev, vm_flags, file,
492                                 pgoff, (end - addr) >> PAGE_SHIFT))
493                         return 0;
494                 if (end == prev->vm_start) {
495                         spin_lock(lock);
496                         prev->vm_start = addr;
497                         prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
498                         spin_unlock(lock);
499                         return 1;
500                 }
501         }
502
503         return 0;
504 }
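
Stripped of the locking and list surgery, the merge decision is interval-and-offset arithmetic: a file-backed request can extend prev only when both the virtual addresses and the page offsets are contiguous. A sketch of that arithmetic on a hypothetical plain range struct (not the kernel vm_area_struct):

/* Hypothetical sketch of the offset arithmetic behind vma_merge(): two
 * file-backed ranges are mergeable only when both the virtual addresses
 * and the page offsets are contiguous. */
#include <stdio.h>

#define PAGE_SHIFT 12

struct range {
        unsigned long start, end;   /* virtual addresses     */
        unsigned long pgoff;        /* file offset, in pages */
};

/* Can a new range at (addr, pgoff) be appended directly after "prev"?
 * This mirrors can_vma_merge_after() above. */
static int merges_after(struct range *prev, unsigned long addr, unsigned long pgoff)
{
        unsigned long prev_pages = (prev->end - prev->start) >> PAGE_SHIFT;
        return prev->end == addr && prev->pgoff + prev_pages == pgoff;
}

int main(void)
{
        struct range prev = { 0x10000, 0x12000, 4 };  /* 2 pages at pgoff 4 */

        /* Contiguous in both address and offset: mergeable. */
        printf("%d\n", merges_after(&prev, 0x12000, 6));  /* 1 */
        /* Address-contiguous but the offset jumps: not mergeable. */
        printf("%d\n", merges_after(&prev, 0x12000, 9));  /* 0 */
        return 0;
}
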
505
506 /*
507  * The caller must hold down_write(current->mm->mmap_sem).
508  */
509
510 unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
511                         unsigned long len, unsigned long prot,
512                         unsigned long flags, unsigned long pgoff)
513 {
514         struct mm_struct * mm = current->mm;
515         struct vm_area_struct * vma, * prev;
516         struct inode *inode;
517         unsigned int vm_flags;
518         int correct_wcount = 0;
519         int error;
520         struct rb_node ** rb_link, * rb_parent;
521         unsigned long charged = 0;
522
523         if (file && (!file->f_op || !file->f_op->mmap))
524                 return -ENODEV;
525
526         if (!len)
527                 return addr;
528
529         if (len > TASK_SIZE)
530                 return -EINVAL;
531
532         len = PAGE_ALIGN(len);
533
534         /* offset overflow? */
535         if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
536                 return -EINVAL;
537
538         /* Too many mappings? */
539         if (mm->map_count > MAX_MAP_COUNT)
540                 return -ENOMEM;
541
542         /* Obtain the address to map to. we verify (or select) it and ensure
543          * that it represents a valid section of the address space.
544          */
545         addr = get_unmapped_area(file, addr, len, pgoff, flags);
546         if (addr & ~PAGE_MASK)
547                 return addr;
548
549         /* Do simple checking here so the lower-level routines won't have
550          * to. we assume access permissions have been handled by the open
551          * of the memory object, so we don't do any here.
552          */
553         vm_flags = calc_vm_flags(prot,flags) | mm->def_flags |
554                         VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
555
556         if (flags & MAP_LOCKED) {
557                 if (!capable(CAP_IPC_LOCK))
558                         return -EPERM;
559                 vm_flags |= VM_LOCKED;
560         }
561         /* mlock MCL_FUTURE? */
562         if (vm_flags & VM_LOCKED) {
563                 unsigned long locked = mm->locked_vm << PAGE_SHIFT;
564                 locked += len;
565                 if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
566                         return -EAGAIN;
567         }
568
569         inode = file ? file->f_dentry->d_inode : NULL;
570
571         if (file) {
572                 switch (flags & MAP_TYPE) {
573                 case MAP_SHARED:
574                         if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
575                                 return -EACCES;
576
577                         /*
578                          * Make sure we don't allow writing to an append-only
579                          * file..
580                          */
581                         if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
582                                 return -EACCES;
583
584                         /*
585                          * Make sure there are no mandatory locks on the file.
586                          */
587                         if (locks_verify_locked(inode))
588                                 return -EAGAIN;
589
590                         vm_flags |= VM_SHARED | VM_MAYSHARE;
591                         if (!(file->f_mode & FMODE_WRITE))
592                                 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
593
594                         /* fall through */
595                 case MAP_PRIVATE:
596                         if (!(file->f_mode & FMODE_READ))
597                                 return -EACCES;
598                         break;
599
600                 default:
601                         return -EINVAL;
602                 }
603         } else {
604                 vm_flags |= VM_SHARED | VM_MAYSHARE;
605                 switch (flags & MAP_TYPE) {
606                 default:
607                         return -EINVAL;
608                 case MAP_PRIVATE:
609                         vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
610                         /* fall through */
611                 case MAP_SHARED:
612                         break;
613                 }
614         }
615
616         error = security_file_mmap(file, prot, flags);
617         if (error)
618                 return error;
619                 
620         /* Clear old maps */
621         error = -ENOMEM;
622 munmap_back:
623         vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
624         if (vma && vma->vm_start < addr + len) {
625                 if (do_munmap(mm, addr, len))
626                         return -ENOMEM;
627                 goto munmap_back;
628         }
629
630         /* Check against address space limit. */
631         if ((mm->total_vm << PAGE_SHIFT) + len
632             > current->rlim[RLIMIT_AS].rlim_cur)
633                 return -ENOMEM;
634
635         if (!(flags & MAP_NORESERVE) || sysctl_overcommit_memory > 1) {
636                 if (vm_flags & VM_SHARED) {
637                         /* Check memory availability in shmem_file_setup? */
638                         vm_flags |= VM_ACCOUNT;
639                 } else if (vm_flags & VM_WRITE) {
640                         /*
641                          * Private writable mapping: check memory availability
642                          */
643                         charged = len >> PAGE_SHIFT;
644                         if (!vm_enough_memory(charged))
645                                 return -ENOMEM;
646                         vm_flags |= VM_ACCOUNT;
647                 }
648         }
649
650         /* Can we just expand an old anonymous mapping? */
651         if (!file && !(vm_flags & VM_SHARED) && rb_parent)
652                 if (vma_merge(mm, prev, rb_parent, addr, addr + len,
653                                         vm_flags, NULL, 0))
654                         goto out;
655
656         /*
657          * Determine the object being mapped and call the appropriate
658          * specific mapper. The address has already been validated but
659          * not unmapped; the old maps have already been removed from the list.
660          */
661         vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
662         error = -ENOMEM;
663         if (!vma)
664                 goto unacct_error;
665
666         vma->vm_mm = mm;
667         vma->vm_start = addr;
668         vma->vm_end = addr + len;
669         vma->vm_flags = vm_flags;
670         vma->vm_page_prot = protection_map[vm_flags & 0x0f];
671         vma->vm_ops = NULL;
672         vma->vm_pgoff = pgoff;
673         vma->vm_file = NULL;
674         vma->vm_private_data = NULL;
675         INIT_LIST_HEAD(&vma->shared);
676
677         if (file) {
678                 error = -EINVAL;
679                 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
680                         goto free_vma;
681                 if (vm_flags & VM_DENYWRITE) {
682                         error = deny_write_access(file);
683                         if (error)
684                                 goto free_vma;
685                         correct_wcount = 1;
686                 }
687                 vma->vm_file = file;
688                 get_file(file);
689                 error = file->f_op->mmap(file, vma);
690                 if (error)
691                         goto unmap_and_free_vma;
692         } else if (vm_flags & VM_SHARED) {
693                 error = shmem_zero_setup(vma);
694                 if (error)
695                         goto free_vma;
696         }
697
698         /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
699          * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
700          * that memory reservation must be checked; but that reservation
701          * belongs to shared memory object, not to vma: so now clear it.
702          */
703         if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
704                 vma->vm_flags &= ~VM_ACCOUNT;
705
706         /* Can addr have changed??
707          *
708          * Answer: Yes, several device drivers can do it in their
709          *         f_op->mmap method. -DaveM
710          */
711         addr = vma->vm_start;
712
713         if (!file || !rb_parent || !vma_merge(mm, prev, rb_parent, addr,
714                                 addr + len, vma->vm_flags, file, pgoff)) {
715                 vma_link(mm, vma, prev, rb_link, rb_parent);
716                 if (correct_wcount)
717                         atomic_inc(&inode->i_writecount);
718         } else {
719                 if (file) {
720                         if (correct_wcount)
721                                 atomic_inc(&inode->i_writecount);
722                         fput(file);
723                 }
724                 kmem_cache_free(vm_area_cachep, vma);
725         }
726 out:    
727         mm->total_vm += len >> PAGE_SHIFT;
728         if (vm_flags & VM_LOCKED) {
729                 mm->locked_vm += len >> PAGE_SHIFT;
730                 make_pages_present(addr, addr + len);
731         }
732         if (flags & MAP_POPULATE) {
733                 up_write(&mm->mmap_sem);
734                 sys_remap_file_pages(addr, len, prot,
735                                         pgoff, flags & MAP_NONBLOCK);
736                 down_write(&mm->mmap_sem);
737         }
738         return addr;
739
740 unmap_and_free_vma:
741         if (correct_wcount)
742                 atomic_inc(&inode->i_writecount);
743         vma->vm_file = NULL;
744         fput(file);
745
746         /* Undo any partial mapping done by a device driver. */
747         zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
748 free_vma:
749         kmem_cache_free(vm_area_cachep, vma);
750 unacct_error:
751         if (charged)
752                 vm_unacct_memory(charged);
753         return error;
754 }
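
The MAP_SHARED permission checks near the top of do_mmap_pgoff() are directly visible from userspace: a shared writable mapping of a descriptor opened O_RDONLY is refused with EACCES, while the same request as MAP_PRIVATE succeeds and writes go to copy-on-write pages. A usage sketch (the file path is just an arbitrary readable file):

/* Userspace sketch of the MAP_SHARED write-permission check done in
 * do_mmap_pgoff(): mapping a read-only fd shared+writable is refused. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
        int fd = open("/etc/hostname", O_RDONLY);   /* any readable file */
        if (fd < 0)
                return 1;

        void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                perror("MAP_SHARED on O_RDONLY fd");  /* expected: EACCES */

        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (p != MAP_FAILED) {
                printf("MAP_PRIVATE mapping at %p (copy-on-write)\n", p);
                munmap(p, 4096);
        }
        close(fd);
        return 0;
}
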
755
756 /* Get an address range which is currently unmapped.
757  * For shmat() with addr=0.
758  *
759  * Ugly calling convention alert:
760  * Return value with the low bits set means error value,
761  * ie
762  *      if (ret & ~PAGE_MASK)
763  *              error = ret;
764  *
765  * This function "knows" that -ENOMEM has the bits set.
766  */
767 #ifndef HAVE_ARCH_UNMAPPED_AREA
768 static inline unsigned long
769 arch_get_unmapped_area(struct file *filp, unsigned long addr,
770                 unsigned long len, unsigned long pgoff, unsigned long flags)
771 {
772         struct mm_struct *mm = current->mm;
773         struct vm_area_struct *vma;
774         int found_hole = 0;
775
776         if (len > TASK_SIZE)
777                 return -ENOMEM;
778
779         if (addr) {
780                 addr = PAGE_ALIGN(addr);
781                 vma = find_vma(mm, addr);
782                 if (TASK_SIZE - len >= addr &&
783                     (!vma || addr + len <= vma->vm_start))
784                         return addr;
785         }
786         addr = mm->free_area_cache;
787
788         for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
789                 /* At this point:  (!vma || addr < vma->vm_end). */
790                 if (TASK_SIZE - len < addr)
791                         return -ENOMEM;
792                 /*
793                  * Record the first available hole.
794                  */
795                 if (!found_hole && (!vma || addr < vma->vm_start)) {
796                         mm->free_area_cache = addr;
797                         found_hole = 1;
798                 }
799                 if (!vma || addr + len <= vma->vm_start)
800                         return addr;
801                 addr = vma->vm_end;
802         }
803 }
804 #else
805 extern unsigned long
806 arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
807                         unsigned long, unsigned long);
808 #endif  
809
810 unsigned long
811 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
812                 unsigned long pgoff, unsigned long flags)
813 {
814         if (flags & MAP_FIXED) {
815                 unsigned long ret;
816
817                 if (addr > TASK_SIZE - len)
818                         return -ENOMEM;
819                 if (addr & ~PAGE_MASK)
820                         return -EINVAL;
821                 if (file && is_file_hugepages(file))  {
822                         /*
823                          * Make sure that addr and length are properly aligned.
824                          */
825                         ret = is_aligned_hugepage_range(addr, len);
826                 } else {
827                         /*
828                          * Ensure that a normal request is not falling in a
829                          * reserved hugepage range.  For some archs like IA-64,
830                          * there is a separate region for hugepages.
831                          */
832                         ret = check_valid_hugepage_range(addr, len);
833                 }
834                 if (ret)
835                         return ret;
836                 return addr;
837         }
838
839         if (file && file->f_op && file->f_op->get_unmapped_area)
840                 return file->f_op->get_unmapped_area(file, addr, len,
841                                                 pgoff, flags);
842
843         return arch_get_unmapped_area(file, addr, len, pgoff, flags);
844 }
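
Callers decode the return value exactly as the comment above describes: any result with low bits set is a negative errno, anything page-aligned is an address (do_mmap_pgoff() applies the same addr & ~PAGE_MASK test). A small sketch of the idiom, assuming a 4 KB page size:

/* Sketch of the "error value has low bits set" convention used by
 * get_unmapped_area(): page-aligned means address, otherwise -errno. */
#include <stdio.h>
#include <errno.h>

#define PAGE_MASK (~((unsigned long)4096 - 1))

static void check(unsigned long ret)
{
        if (ret & ~PAGE_MASK)
                printf("error: %ld\n", (long)ret);      /* e.g. -ENOMEM */
        else
                printf("address: 0x%lx\n", ret);
}

int main(void)
{
        check(0x40000000);              /* a page-aligned address */
        check((unsigned long)-ENOMEM);  /* an error value         */
        return 0;
}
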
845
846 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
847 struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
848 {
849         struct vm_area_struct *vma = NULL;
850
851         if (mm) {
852                 /* Check the cache first. */
853                 /* (Cache hit rate is typically around 35%.) */
854                 vma = mm->mmap_cache;
855                 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
856                         struct rb_node * rb_node;
857
858                         rb_node = mm->mm_rb.rb_node;
859                         vma = NULL;
860
861                         while (rb_node) {
862                                 struct vm_area_struct * vma_tmp;
863
864                                 vma_tmp = rb_entry(rb_node,
865                                                 struct vm_area_struct, vm_rb);
866
867                                 if (vma_tmp->vm_end > addr) {
868                                         vma = vma_tmp;
869                                         if (vma_tmp->vm_start <= addr)
870                                                 break;
871                                         rb_node = rb_node->rb_left;
872                                 } else
873                                         rb_node = rb_node->rb_right;
874                         }
875                         if (vma)
876                                 mm->mmap_cache = vma;
877                 }
878         }
879         return vma;
880 }
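
Note the contract: the returned VMA is the first one ending above addr, which need not actually contain addr; callers that care, such as find_vma_intersection(), compare against vm_start as well. A hypothetical sketch of the same contract over a sorted array, standing in for the rb-tree walk:

/* Hypothetical sketch of the find_vma() contract on a sorted array:
 * return the first range whose end is above addr; it may or may not
 * actually contain addr. */
#include <stdio.h>

struct range { unsigned long start, end; };

static struct range *find_range(struct range *r, int n, unsigned long addr)
{
        for (int i = 0; i < n; i++)
                if (addr < r[i].end)
                        return &r[i];
        return NULL;
}

int main(void)
{
        struct range map[] = { { 0x1000, 0x2000 }, { 0x5000, 0x6000 } };

        struct range *v = find_range(map, 2, 0x3000);
        /* addr falls in the hole: the next higher range is returned. */
        printf("start=0x%lx end=0x%lx\n", v->start, v->end); /* 0x5000..0x6000 */
        return 0;
}
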
881
882 /* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
883 struct vm_area_struct *
884 find_vma_prev(struct mm_struct *mm, unsigned long addr,
885                         struct vm_area_struct **pprev)
886 {
887         struct vm_area_struct *vma = NULL, *prev = NULL;
888         struct rb_node * rb_node;
889         if (!mm)
890                 goto out;
891
892         /* Guard against addr being lower than the first VMA */
893         vma = mm->mmap;
894
895         /* Go through the RB tree quickly. */
896         rb_node = mm->mm_rb.rb_node;
897
898         while (rb_node) {
899                 struct vm_area_struct *vma_tmp;
900                 vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
901
902                 if (addr < vma_tmp->vm_end) {
903                         rb_node = rb_node->rb_left;
904                 } else {
905                         prev = vma_tmp;
906                         if (!prev->vm_next || (addr < prev->vm_next->vm_end))
907                                 break;
908                         rb_node = rb_node->rb_right;
909                 }
910         }
911
912 out:
913         *pprev = prev;
914         return prev ? prev->vm_next : vma;
915 }
916
917 #ifdef CONFIG_STACK_GROWSUP
918 /*
919  * vma is the first one with address > vma->vm_end.  Have to extend vma.
920  */
921 int expand_stack(struct vm_area_struct * vma, unsigned long address)
922 {
923         unsigned long grow;
924
925         if (!(vma->vm_flags & VM_GROWSUP))
926                 return -EFAULT;
927
928         /*
929          * vma->vm_start/vm_end cannot change under us because the caller
930          * is required to hold the mmap_sem in read mode. We need to get
931          * the spinlock only before relocating the vma range ourself.
932          */
933         address += 4 + PAGE_SIZE - 1;
934         address &= PAGE_MASK;
935         spin_lock(&vma->vm_mm->page_table_lock);
936         grow = (address - vma->vm_end) >> PAGE_SHIFT;
937
938         /* Overcommit.. */
939         if (!vm_enough_memory(grow)) {
940                 spin_unlock(&vma->vm_mm->page_table_lock);
941                 return -ENOMEM;
942         }
943         
944         if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur ||
945                         ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
946                         current->rlim[RLIMIT_AS].rlim_cur) {
947                 spin_unlock(&vma->vm_mm->page_table_lock);
948                 vm_unacct_memory(grow);
949                 return -ENOMEM;
950         }
951         vma->vm_end = address;
952         vma->vm_mm->total_vm += grow;
953         if (vma->vm_flags & VM_LOCKED)
954                 vma->vm_mm->locked_vm += grow;
955         spin_unlock(&vma->vm_mm->page_table_lock);
956         return 0;
957 }
958
959 struct vm_area_struct *
960 find_extend_vma(struct mm_struct *mm, unsigned long addr)
961 {
962         struct vm_area_struct *vma, *prev;
963
964         addr &= PAGE_MASK;
965         vma = find_vma_prev(mm, addr, &prev);
966         if (vma && (vma->vm_start <= addr))
967                 return vma;
968         if (!prev || expand_stack(prev, addr))
969                 return NULL;
970         if (prev->vm_flags & VM_LOCKED) {
971                 make_pages_present(addr, prev->vm_end);
972         }
973         return prev;
974 }
975 #else
976 /*
977  * vma is the first one with address < vma->vm_start.  Have to extend vma.
978  */
979 int expand_stack(struct vm_area_struct *vma, unsigned long address)
980 {
981         unsigned long grow;
982
983         /*
984          * vma->vm_start/vm_end cannot change under us because the caller
985          * is required to hold the mmap_sem in read mode. We need to get
986          * the spinlock only before relocating the vma range ourself.
987          */
988         address &= PAGE_MASK;
989         spin_lock(&vma->vm_mm->page_table_lock);
990         grow = (vma->vm_start - address) >> PAGE_SHIFT;
991
992         /* Overcommit.. */
993         if (!vm_enough_memory(grow)) {
994                 spin_unlock(&vma->vm_mm->page_table_lock);
995                 return -ENOMEM;
996         }
997         
998         if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur ||
999                         ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
1000                         current->rlim[RLIMIT_AS].rlim_cur) {
1001                 spin_unlock(&vma->vm_mm->page_table_lock);
1002                 vm_unacct_memory(grow);
1003                 return -ENOMEM;
1004         }
1005         vma->vm_start = address;
1006         vma->vm_pgoff -= grow;
1007         vma->vm_mm->total_vm += grow;
1008         if (vma->vm_flags & VM_LOCKED)
1009                 vma->vm_mm->locked_vm += grow;
1010         spin_unlock(&vma->vm_mm->page_table_lock);
1011         return 0;
1012 }
1013
1014 struct vm_area_struct *
1015 find_extend_vma(struct mm_struct * mm, unsigned long addr)
1016 {
1017         struct vm_area_struct * vma;
1018         unsigned long start;
1019
1020         addr &= PAGE_MASK;
1021         vma = find_vma(mm,addr);
1022         if (!vma)
1023                 return NULL;
1024         if (vma->vm_start <= addr)
1025                 return vma;
1026         if (!(vma->vm_flags & VM_GROWSDOWN))
1027                 return NULL;
1028         start = vma->vm_start;
1029         if (expand_stack(vma, addr))
1030                 return NULL;
1031         if (vma->vm_flags & VM_LOCKED) {
1032                 make_pages_present(addr, start);
1033         }
1034         return vma;
1035 }
1036 #endif
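
Both expand_stack() variants do the same accounting: the growth is charged in pages through vm_enough_memory(), the new extent is checked against RLIMIT_STACK, and the enlarged total_vm (in bytes) against RLIMIT_AS. A standalone sketch of that arithmetic for the grows-down case, with made-up addresses and limits:

/* Sketch of the expand_stack() accounting arithmetic with made-up
 * numbers: grow is counted in pages, the rlimit comparison in bytes. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long vm_start = 0xbfff8000, vm_end = 0xc0000000; /* 32 KB stack */
        unsigned long address  = 0xbfff4000;                      /* fault below */
        unsigned long stack_rlim = 8UL << 20;                     /* 8 MB limit  */

        unsigned long grow = (vm_start - address) >> PAGE_SHIFT;  /* pages */
        unsigned long new_size = vm_end - address;                /* bytes */

        printf("grow by %lu pages, new stack %lu KB -> %s\n",
               grow, new_size >> 10,
               new_size > stack_rlim ? "refused" : "allowed");
        return 0;
}
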
1037
1038 /*
1039  * Try to free as many page directory entries as we can,
1040  * without having to work very hard at actually scanning
1041  * the page tables themselves.
1042  *
1043  * Right now we try to free page tables if we have a nice
1044  * PGDIR-aligned area that got free'd up. We could be more
1045  * granular if we want to, but this is fast and simple,
1046  * and covers the bad cases.
1047  *
1048  * "prev", if it exists, points to a vma before the one
1049  * we just free'd - but there's no telling how much before.
1050  */
1051 static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
1052         unsigned long start, unsigned long end)
1053 {
1054         unsigned long first = start & PGDIR_MASK;
1055         unsigned long last = end + PGDIR_SIZE - 1;
1056         unsigned long start_index, end_index;
1057         struct mm_struct *mm = tlb->mm;
1058
1059         if (!prev) {
1060                 prev = mm->mmap;
1061                 if (!prev)
1062                         goto no_mmaps;
1063                 if (prev->vm_end > start) {
1064                         if (last > prev->vm_start)
1065                                 last = prev->vm_start;
1066                         goto no_mmaps;
1067                 }
1068         }
1069         for (;;) {
1070                 struct vm_area_struct *next = prev->vm_next;
1071
1072                 if (next) {
1073                         if (next->vm_start < start) {
1074                                 prev = next;
1075                                 continue;
1076                         }
1077                         if (last > next->vm_start)
1078                                 last = next->vm_start;
1079                 }
1080                 if (prev->vm_end > first)
1081                         first = prev->vm_end + PGDIR_SIZE - 1;
1082                 break;
1083         }
1084 no_mmaps:
1085         if (last < first)       /* for arches with discontiguous pgd indices */
1086                 return;
1087         /*
1088          * If the PGD bits are not consecutive in the virtual address, the
1089          * old method of shifting the VA >> by PGDIR_SHIFT doesn't work.
1090          */
1091         start_index = pgd_index(first);
1092         if (start_index < FIRST_USER_PGD_NR)
1093                 start_index = FIRST_USER_PGD_NR;
1094         end_index = pgd_index(last);
1095         if (end_index > start_index) {
1096                 clear_page_tables(tlb, start_index, end_index - start_index);
1097                 flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK);
1098         }
1099 }
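
The rounding above widens the freed range to whole page-directory entries before converting to pgd indices. A sketch of that rounding, assuming the classic two-level x86 layout (PGDIR_SHIFT of 22, so 4 MB per pgd entry); other architectures use different constants:

/* Sketch of the PGDIR rounding used by free_pgtables(), with assumed
 * two-level x86 constants. */
#include <stdio.h>

#define PGDIR_SHIFT 22
#define PGDIR_SIZE  (1UL << PGDIR_SHIFT)
#define PGDIR_MASK  (~(PGDIR_SIZE - 1))

int main(void)
{
        unsigned long start = 0x00ba0000, end = 0x01500000;

        unsigned long first = start & PGDIR_MASK;        /* round down */
        unsigned long last  = end + PGDIR_SIZE - 1;      /* round up   */

        printf("pgd indices %lu .. %lu\n",
               first >> PGDIR_SHIFT, last >> PGDIR_SHIFT); /* 2 .. 6 */
        return 0;
}
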
1100
1101 /* Normal function to fix up a mapping
1102  * This function is the default for when an area has no specific
1103  * function.  This may be used as part of a more specific routine.
1104  *
1105  * By the time this function is called, the area struct has been
1106  * removed from the process mapping list.
1107  */
1108 static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
1109 {
1110         size_t len = area->vm_end - area->vm_start;
1111
1112         area->vm_mm->total_vm -= len >> PAGE_SHIFT;
1113         if (area->vm_flags & VM_LOCKED)
1114                 area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
1115         /*
1116          * Is this a new hole at the lowest possible address?
1117          */
1118         if (area->vm_start >= TASK_UNMAPPED_BASE &&
1119                                 area->vm_start < area->vm_mm->free_area_cache)
1120               area->vm_mm->free_area_cache = area->vm_start;
1121
1122         remove_shared_vm_struct(area);
1123
1124         if (area->vm_ops && area->vm_ops->close)
1125                 area->vm_ops->close(area);
1126         if (area->vm_file)
1127                 fput(area->vm_file);
1128         kmem_cache_free(vm_area_cachep, area);
1129 }
1130
1131 /*
1132  * Update the VMA and inode share lists.
1133  *
1134  * Ok - we have the memory areas we should free on the 'free' list,
1135  * so release them, and do the vma updates.
1136  */
1137 static void unmap_vma_list(struct mm_struct *mm,
1138         struct vm_area_struct *mpnt)
1139 {
1140         do {
1141                 struct vm_area_struct *next = mpnt->vm_next;
1142                 unmap_vma(mm, mpnt);
1143                 mpnt = next;
1144         } while (mpnt != NULL);
1145         validate_mm(mm);
1146 }
1147
1148 /*
1149  * Get rid of page table information in the indicated region.
1150  *
1151  * Called with the page table lock held.
1152  */
1153 static void unmap_region(struct mm_struct *mm,
1154         struct vm_area_struct *vma,
1155         struct vm_area_struct *prev,
1156         unsigned long start,
1157         unsigned long end)
1158 {
1159         struct mmu_gather *tlb;
1160         unsigned long nr_accounted = 0;
1161
1162         lru_add_drain();
1163         tlb = tlb_gather_mmu(mm, 0);
1164         unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted);
1165         vm_unacct_memory(nr_accounted);
1166         free_pgtables(tlb, prev, start, end);
1167         tlb_finish_mmu(tlb, start, end);
1168 }
1169
1170 /*
1171  * Create a list of vma's touched by the unmap, removing them from the mm's
1172  * vma list as we go..
1173  *
1174  * Called with the page_table_lock held.
1175  */
1176 static void
1177 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1178         struct vm_area_struct *prev, unsigned long end)
1179 {
1180         struct vm_area_struct **insertion_point;
1181         struct vm_area_struct *tail_vma = NULL;
1182
1183         insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1184         do {
1185                 rb_erase(&vma->vm_rb, &mm->mm_rb);
1186                 mm->map_count--;
1187                 tail_vma = vma;
1188                 vma = vma->vm_next;
1189         } while (vma && vma->vm_start < end);
1190         *insertion_point = vma;
1191         tail_vma->vm_next = NULL;
1192         mm->mmap_cache = NULL;          /* Kill the cache. */
1193 }
1194
1195 /*
1196  * Split a vma into two pieces at address 'addr', a new vma is allocated
1197  * either for the first part or the tail.
1198  */
1199 int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1200               unsigned long addr, int new_below)
1201 {
1202         struct vm_area_struct *new;
1203
1204         if (mm->map_count >= MAX_MAP_COUNT)
1205                 return -ENOMEM;
1206
1207         new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1208         if (!new)
1209                 return -ENOMEM;
1210
1211         /* most fields are the same, copy all, and then fixup */
1212         *new = *vma;
1213
1214         INIT_LIST_HEAD(&new->shared);
1215
1216         if (new_below) {
1217                 new->vm_end = addr;
1218                 vma->vm_start = addr;
1219                 vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT);
1220         } else {
1221                 vma->vm_end = addr;
1222                 new->vm_start = addr;
1223                 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
1224         }
1225
1226         if (new->vm_file)
1227                 get_file(new->vm_file);
1228
1229         if (new->vm_ops && new->vm_ops->open)
1230                 new->vm_ops->open(new);
1231
1232         insert_vm_struct(mm, new);
1233         return 0;
1234 }
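
The only subtle step is the vm_pgoff fix-up: whichever piece covers the higher addresses must advance its file offset by the number of pages between the original start and the split point. A sketch on a hypothetical plain range struct:

/* Hypothetical sketch of the split_vma() pgoff fix-up on a plain range. */
#include <stdio.h>

#define PAGE_SHIFT 12

struct range { unsigned long start, end, pgoff; };

int main(void)
{
        struct range vma = { 0x10000, 0x20000, 0 };   /* 16 pages of a file  */
        unsigned long addr = 0x14000;                 /* split after 4 pages */

        struct range low  = { vma.start, addr, vma.pgoff };
        struct range high = { addr, vma.end,
                              vma.pgoff + ((addr - vma.start) >> PAGE_SHIFT) };

        printf("low:  0x%lx-0x%lx pgoff %lu\n", low.start, low.end, low.pgoff);
        printf("high: 0x%lx-0x%lx pgoff %lu\n", high.start, high.end, high.pgoff);
        return 0;
}
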
1235
1236 /* Munmap is split into 2 main parts -- this part which finds
1237  * what needs doing, and the areas themselves, which do the
1238  * work.  This now handles partial unmappings.
1239  * Jeremy Fitzhardinge <jeremy@goop.org>
1240  */
1241 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1242 {
1243         unsigned long end;
1244         struct vm_area_struct *mpnt, *prev, *last;
1245
1246         if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
1247                 return -EINVAL;
1248
1249         if ((len = PAGE_ALIGN(len)) == 0)
1250                 return -EINVAL;
1251
1252         /* Find the first overlapping VMA */
1253         mpnt = find_vma_prev(mm, start, &prev);
1254         if (!mpnt)
1255                 return 0;
1256         /* we have  start < mpnt->vm_end  */
1257
1258         if (is_vm_hugetlb_page(mpnt)) {
1259                 int ret = is_aligned_hugepage_range(start, len);
1260
1261                 if (ret)
1262                         return ret;
1263         }
1264
1265         /* if it doesn't overlap, we have nothing.. */
1266         end = start + len;
1267         if (mpnt->vm_start >= end)
1268                 return 0;
1269
1270         /* Something will probably happen, so notify. */
1271         if (mpnt->vm_file && (mpnt->vm_flags & VM_EXEC))
1272                 profile_exec_unmap(mm);
1273  
1274         /*
1275          * If we need to split any vma, do it now to save pain later.
1276          *
1277          * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
1278          * unmapped vm_area_struct will remain in use: so lower split_vma
1279          * places tmp vma above, and higher split_vma places tmp vma below.
1280          */
1281         if (start > mpnt->vm_start) {
1282                 if (split_vma(mm, mpnt, start, 0))
1283                         return -ENOMEM;
1284                 prev = mpnt;
1285         }
1286
1287         /* Does it split the last one? */
1288         last = find_vma(mm, end);
1289         if (last && end > last->vm_start) {
1290                 if (split_vma(mm, last, end, 1))
1291                         return -ENOMEM;
1292         }
1293         mpnt = prev? prev->vm_next: mm->mmap;
1294
1295         /*
1296          * Remove the vma's, and unmap the actual pages
1297          */
1298         spin_lock(&mm->page_table_lock);
1299         detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
1300         unmap_region(mm, mpnt, prev, start, end);
1301         spin_unlock(&mm->page_table_lock);
1302
1303         /* Fix up all other VM information */
1304         unmap_vma_list(mm, mpnt);
1305
1306         return 0;
1307 }
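
Seen from userspace, the partial-unmap handling means munmap() may punch a hole in the middle of an existing mapping, leaving two smaller VMAs behind (subject to the MAX_MAP_COUNT check in split_vma()). A usage sketch:

/* Userspace sketch: unmapping the middle page of a 3-page anonymous
 * mapping, which exercises the split_vma()/do_munmap() path above. */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        size_t page = 4096;
        char *p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        p[0] = 1;                       /* first page stays mapped    */
        munmap(p + page, page);         /* punch a hole in the middle */
        p[2 * page] = 1;                /* last page stays mapped     */

        printf("hole punched at %p\n", (void *)(p + page));
        munmap(p, page);
        munmap(p + 2 * page, page);
        return 0;
}
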
1308
1309 asmlinkage long sys_munmap(unsigned long addr, size_t len)
1310 {
1311         int ret;
1312         struct mm_struct *mm = current->mm;
1313
1314         down_write(&mm->mmap_sem);
1315         ret = do_munmap(mm, addr, len);
1316         up_write(&mm->mmap_sem);
1317         return ret;
1318 }
1319
1320 /*
1321  *  this is really a simplified "do_mmap".  it only handles
1322  *  anonymous maps.  eventually we may be able to do some
1323  *  brk-specific accounting here.
1324  */
1325 unsigned long do_brk(unsigned long addr, unsigned long len)
1326 {
1327         struct mm_struct * mm = current->mm;
1328         struct vm_area_struct * vma, * prev;
1329         unsigned long flags;
1330         struct rb_node ** rb_link, * rb_parent;
1331
1332         len = PAGE_ALIGN(len);
1333         if (!len)
1334                 return addr;
1335
1336         /*
1337          * mlock MCL_FUTURE?
1338          */
1339         if (mm->def_flags & VM_LOCKED) {
1340                 unsigned long locked = mm->locked_vm << PAGE_SHIFT;
1341                 locked += len;
1342                 if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
1343                         return -EAGAIN;
1344         }
1345
1346         /*
1347          * Clear old maps.  this also does some error checking for us
1348          */
1349  munmap_back:
1350         vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
1351         if (vma && vma->vm_start < addr + len) {
1352                 if (do_munmap(mm, addr, len))
1353                         return -ENOMEM;
1354                 goto munmap_back;
1355         }
1356
1357         /* Check against address space limits *after* clearing old maps... */
1358         if ((mm->total_vm << PAGE_SHIFT) + len
1359             > current->rlim[RLIMIT_AS].rlim_cur)
1360                 return -ENOMEM;
1361
1362         if (mm->map_count > MAX_MAP_COUNT)
1363                 return -ENOMEM;
1364
1365         if (!vm_enough_memory(len >> PAGE_SHIFT))
1366                 return -ENOMEM;
1367
1368         flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1369
1370         /* Can we just expand an old anonymous mapping? */
1371         if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len,
1372                                         flags, NULL, 0))
1373                 goto out;
1374
1375         /*
1376          * create a vma struct for an anonymous mapping
1377          */
1378         vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1379         if (!vma) {
1380                 vm_unacct_memory(len >> PAGE_SHIFT);
1381                 return -ENOMEM;
1382         }
1383
1384         vma->vm_mm = mm;
1385         vma->vm_start = addr;
1386         vma->vm_end = addr + len;
1387         vma->vm_flags = flags;
1388         vma->vm_page_prot = protection_map[flags & 0x0f];
1389         vma->vm_ops = NULL;
1390         vma->vm_pgoff = 0;
1391         vma->vm_file = NULL;
1392         vma->vm_private_data = NULL;
1393         INIT_LIST_HEAD(&vma->shared);
1394
1395         vma_link(mm, vma, prev, rb_link, rb_parent);
1396
1397 out:
1398         mm->total_vm += len >> PAGE_SHIFT;
1399         if (flags & VM_LOCKED) {
1400                 mm->locked_vm += len >> PAGE_SHIFT;
1401                 make_pages_present(addr, addr + len);
1402         }
1403         return addr;
1404 }
1405
1406 /* Build the RB tree corresponding to the VMA list. */
1407 void build_mmap_rb(struct mm_struct * mm)
1408 {
1409         struct vm_area_struct * vma;
1410         struct rb_node ** rb_link, * rb_parent;
1411
1412         mm->mm_rb = RB_ROOT;
1413         rb_link = &mm->mm_rb.rb_node;
1414         rb_parent = NULL;
1415         for (vma = mm->mmap; vma; vma = vma->vm_next) {
1416                 __vma_link_rb(mm, vma, rb_link, rb_parent);
1417                 rb_parent = &vma->vm_rb;
1418                 rb_link = &rb_parent->rb_right;
1419         }
1420 }
1421
1422 /* Release all mmaps. */
1423 void exit_mmap(struct mm_struct *mm)
1424 {
1425         struct mmu_gather *tlb;
1426         struct vm_area_struct *vma;
1427         unsigned long nr_accounted = 0;
1428
1429         profile_exit_mmap(mm);
1430  
1431         lru_add_drain();
1432
1433         spin_lock(&mm->page_table_lock);
1434
1435         tlb = tlb_gather_mmu(mm, 1);
1436         flush_cache_mm(mm);
1437         /* Use ~0UL here to ensure all VMAs in the mm are unmapped */
1438         mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
1439                                         ~0UL, &nr_accounted);
1440         vm_unacct_memory(nr_accounted);
1441         BUG_ON(mm->map_count);  /* This is just debugging */
1442         clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
1443         tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
1444
1445         vma = mm->mmap;
1446         mm->mmap = mm->mmap_cache = NULL;
1447         mm->mm_rb = RB_ROOT;
1448         mm->rss = 0;
1449         mm->total_vm = 0;
1450         mm->locked_vm = 0;
1451
1452         spin_unlock(&mm->page_table_lock);
1453
1454         /*
1455          * Walk the list again, actually closing and freeing it
1456          * without holding any MM locks.
1457          */
1458         while (vma) {
1459                 struct vm_area_struct *next = vma->vm_next;
1460                 remove_shared_vm_struct(vma);
1461                 if (vma->vm_ops) {
1462                         if (vma->vm_ops->close)
1463                                 vma->vm_ops->close(vma);
1464                 }
1465                 if (vma->vm_file)
1466                         fput(vma->vm_file);
1467                 kmem_cache_free(vm_area_cachep, vma);
1468                 vma = next;
1469         }
1470 }
1471
1472 /* Insert vm structure into process list sorted by address
1473  * and into the inode's i_mmap ring.  If vm_file is non-NULL
1474  * then i_shared_sem is taken here.
1475  */
1476 void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
1477 {
1478         struct vm_area_struct * __vma, * prev;
1479         struct rb_node ** rb_link, * rb_parent;
1480
1481         __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
1482         if (__vma && __vma->vm_start < vma->vm_end)
1483                 BUG();
1484         vma_link(mm, vma, prev, rb_link, rb_parent);
1485         validate_mm(mm);
1486 }