mm/mmap.c

   1 /*
   2  * mm/mmap.c
   3  *
   4  * Written by obz.
   5  *
   6  * Address space accounting code        <alan@redhat.com>
   7  */
   8
   9 #include <linux/slab.h>
  10 #include <linux/shm.h>
  11 #include <linux/mman.h>
  12 #include <linux/pagemap.h>
  13 #include <linux/swap.h>
  14 #include <linux/init.h>
  15 #include <linux/file.h>
  16 #include <linux/fs.h>
  17 #include <linux/personality.h>
  18 #include <linux/security.h>
  19 #include <linux/hugetlb.h>
  20 #include <linux/profile.h>
  21
  22 #include <asm/uaccess.h>
  23 #include <asm/pgalloc.h>
  24 #include <asm/tlb.h>
  25
  26 /*
  27  * WARNING: the debugging will use recursive algorithms so never enable this
  28  * unless you know what you are doing.
  29  */
  30 #undef DEBUG_MM_RB
  31
  32 /* description of effects of mapping type and prot in current implementation.
  33  * this is due to the limited x86 page protection hardware.  The expected
  34  * behavior is in parens:
  35  *
  36  * map_type     prot
  37  *              PROT_NONE       PROT_READ       PROT_WRITE      PROT_EXEC
  38  * MAP_SHARED   r: (no) no      r: (yes) yes    r: (no) yes     r: (no) yes
  39  *              w: (no) no      w: (no) no      w: (yes) yes    w: (no) no
  40  *              x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
  41  *
  42  * MAP_PRIVATE  r: (no) no      r: (yes) yes    r: (no) yes     r: (no) yes
  43  *              w: (no) no      w: (no) no      w: (copy) copy  w: (no) no
  44  *              x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
  45  *
  46  */
  47 pgprot_t protection_map[16] = {
  48         __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
  49         __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
  50 };
  51
  52 int sysctl_overcommit_memory = 0;       /* default is heuristic overcommit */
  53 int sysctl_overcommit_ratio = 50;       /* default is 50% */
  54 atomic_t vm_committed_space = ATOMIC_INIT(0);
  55
  56 /*
  57  * Check that a process has enough memory to allocate a new virtual
  58  * mapping. 1 means there is enough memory for the allocation to
  59  * succeed and 0 implies there is not.
  60  *
  61  * We currently support three overcommit policies, which are set via the
  62  * vm.overcommit_memory sysctl.  See Documentation/vm/overcommit-acounting
  63  *
  64  * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
  65  * Additional code 2002 Jul 20 by Robert Love.
  66  */
  67 int vm_enough_memory(long pages)
  68 {
  69         unsigned long free, allowed;
  70
  71         vm_acct_memory(pages);
  72
  73         /*
  74          * Sometimes we want to use more memory than we have
  75          */
  76         if (sysctl_overcommit_memory == 1)
  77                 return 1;
  78
  79         if (sysctl_overcommit_memory == 0) {
  80                 free = get_page_cache_size();
  81                 free += nr_free_pages();
  82                 free += nr_swap_pages;
  83
  84                 /*
  85                  * The code below doesn't account for free space in the
  86                  * inode and dentry slab cache, slab cache fragmentation,
  87                  * inodes and dentries which will become freeable under
  88                  * VM load, etc. Lets just hope all these (complex)
  89                  * factors balance out...
  90                  */
  91                 free += (dentry_stat.nr_unused * sizeof(struct dentry)) >>
  92                         PAGE_SHIFT;
  93                 free += (inodes_stat.nr_unused * sizeof(struct inode)) >>
  94                         PAGE_SHIFT;
  95
  96                 if (free > pages)
  97                         return 1;
  98                 vm_unacct_memory(pages);
  99                 return 0;
 100         }
 101
 102         allowed = totalram_pages * sysctl_overcommit_ratio / 100;
 103         allowed += total_swap_pages;
 104
 105         if (atomic_read(&vm_committed_space) < allowed)
 106                 return 1;
 107
 108         vm_unacct_memory(pages);
 109
 110         return 0;
 111 }
 112
 113 /*
 114  * Requires inode->i_mapping->i_shared_sem
 115  */
 116 static inline void
 117 __remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode)
 118 {
 119         if (inode) {
 120                 if (vma->vm_flags & VM_DENYWRITE)
 121                         atomic_inc(&inode->i_writecount);
 122                 list_del_init(&vma->shared);
 123         }
 124 }
 125
 126 /*
 127  * Remove one vm structure from the inode's i_mapping address space.
 128  */
 129 static void remove_shared_vm_struct(struct vm_area_struct *vma)
 130 {
 131         struct file *file = vma->vm_file;
 132
 133         if (file) {
 134                 struct inode *inode = file->f_dentry->d_inode;
 135
 136                 down(&inode->i_mapping->i_shared_sem);
 137                 __remove_shared_vm_struct(vma, inode);
 138                 up(&inode->i_mapping->i_shared_sem);
 139         }
 140 }
 141
 142 /*
 143  *  sys_brk() for the most part doesn't need the global kernel
 144  *  lock, except when an application is doing something nasty
 145  *  like trying to un-brk an area that has already been mapped
 146  *  to a regular file.  in this case, the unmapping will need
 147  *  to invoke file system routines that need the global lock.
 148  */
 149 asmlinkage unsigned long sys_brk(unsigned long brk)
 150 {
 151         unsigned long rlim, retval;
 152         unsigned long newbrk, oldbrk;
 153         struct mm_struct *mm = current->mm;
 154
 155         down_write(&mm->mmap_sem);
 156
 157         if (brk < mm->end_code)
 158                 goto out;
 159         newbrk = PAGE_ALIGN(brk);
 160         oldbrk = PAGE_ALIGN(mm->brk);
 161         if (oldbrk == newbrk)
 162                 goto set_brk;
 163
 164         /* Always allow shrinking brk. */
 165         if (brk <= mm->brk) {
 166                 if (!do_munmap(mm, newbrk, oldbrk-newbrk))
 167                         goto set_brk;
 168                 goto out;
 169         }
 170
 171         /* Check against rlimit.. */
 172         rlim = current->rlim[RLIMIT_DATA].rlim_cur;
 173         if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
 174                 goto out;
 175
 176         /* Check against existing mmap mappings. */
 177         if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
 178                 goto out;
 179
 180         /* Ok, looks good - let it rip. */
 181         if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
 182                 goto out;
 183 set_brk:
 184         mm->brk = brk;
 185 out:
 186         retval = mm->brk;
 187         up_write(&mm->mmap_sem);
 188         return retval;
 189 }
 190
 191 /* Combine the mmap "prot" and "flags" argument into one "vm_flags" used
 192  * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
 193  * into "VM_xxx".
 194  */
 195 static inline unsigned long
 196 calc_vm_flags(unsigned long prot, unsigned long flags)
 197 {
 198 #define _trans(x,bit1,bit2) \
 199 ((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)
 200
 201         unsigned long prot_bits, flag_bits;
 202         prot_bits =
 203                 _trans(prot, PROT_READ, VM_READ) |
 204                 _trans(prot, PROT_WRITE, VM_WRITE) |
 205                 _trans(prot, PROT_EXEC, VM_EXEC);
 206         flag_bits =
 207                 _trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) |
 208                 _trans(flags, MAP_DENYWRITE, VM_DENYWRITE) |
 209                 _trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE);
 210         return prot_bits | flag_bits;
 211 #undef _trans
 212 }
 213
 214 #ifdef DEBUG_MM_RB
 215 static int browse_rb(struct rb_node * rb_node) {
 216         int i = 0;
 217         if (rb_node) {
 218                 i++;
 219                 i += browse_rb(rb_node->rb_left);
 220                 i += browse_rb(rb_node->rb_right);
 221         }
 222         return i;
 223 }
 224
 225 static void validate_mm(struct mm_struct * mm) {
 226         int bug = 0;
 227         int i = 0;
 228         struct vm_area_struct * tmp = mm->mmap;
 229         while (tmp) {
 230                 tmp = tmp->vm_next;
 231                 i++;
 232         }
 233         if (i != mm->map_count)
 234                 printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
 235         i = browse_rb(mm->mm_rb.rb_node);
 236         if (i != mm->map_count)
 237                 printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
 238         if (bug)
 239                 BUG();
 240 }
 241 #else
 242 #define validate_mm(mm) do { } while (0)
 243 #endif
 244
 245 static struct vm_area_struct *
 246 find_vma_prepare(struct mm_struct *mm, unsigned long addr,
 247                 struct vm_area_struct **pprev, struct rb_node ***rb_link,
 248                 struct rb_node ** rb_parent)
 249 {
 250         struct vm_area_struct * vma;
 251         struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
 252
 253         __rb_link = &mm->mm_rb.rb_node;
 254         rb_prev = __rb_parent = NULL;
 255         vma = NULL;
 256
 257         while (*__rb_link) {
 258                 struct vm_area_struct *vma_tmp;
 259
 260                 __rb_parent = *__rb_link;
 261                 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
 262
 263                 if (vma_tmp->vm_end > addr) {
 264                         vma = vma_tmp;
 265                         if (vma_tmp->vm_start <= addr)
 266                                 return vma;
 267                         __rb_link = &__rb_parent->rb_left;
 268                 } else {
 269                         rb_prev = __rb_parent;
 270                         __rb_link = &__rb_parent->rb_right;
 271                 }
 272         }
 273
 274         *pprev = NULL;
 275         if (rb_prev)
 276                 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
 277         *rb_link = __rb_link;
 278         *rb_parent = __rb_parent;
 279         return vma;
 280 }
 281
 282 static inline void
 283 __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 284                 struct vm_area_struct *prev, struct rb_node *rb_parent)
 285 {
 286         if (prev) {
 287                 vma->vm_next = prev->vm_next;
 288                 prev->vm_next = vma;
 289         } else {
 290                 mm->mmap = vma;
 291                 if (rb_parent)
 292                         vma->vm_next = rb_entry(rb_parent,
 293                                         struct vm_area_struct, vm_rb);
 294                 else
 295                         vma->vm_next = NULL;
 296         }
 297 }
 298
 299 static void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 300                         struct rb_node **rb_link, struct rb_node *rb_parent)
 301 {
 302         rb_link_node(&vma->vm_rb, rb_parent, rb_link);
 303         rb_insert_color(&vma->vm_rb, &mm->mm_rb);
 304 }
 305
 306 static inline void __vma_link_file(struct vm_area_struct *vma)
 307 {
 308         struct file * file;
 309
 310         file = vma->vm_file;
 311         if (file) {
 312                 struct inode * inode = file->f_dentry->d_inode;
 313                 struct address_space *mapping = inode->i_mapping;
 314
 315                 if (vma->vm_flags & VM_DENYWRITE)
 316                         atomic_dec(&inode->i_writecount);
 317
 318                 if (vma->vm_flags & VM_SHARED)
 319                         list_add_tail(&vma->shared, &mapping->i_mmap_shared);
 320                 else
 321                         list_add_tail(&vma->shared, &mapping->i_mmap);
 322         }
 323 }
 324
 325 static void
 326 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 327         struct vm_area_struct *prev, struct rb_node **rb_link,
 328         struct rb_node *rb_parent)
 329 {
 330         __vma_link_list(mm, vma, prev, rb_parent);
 331         __vma_link_rb(mm, vma, rb_link, rb_parent);
 332         __vma_link_file(vma);
 333 }
 334
 335 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
 336                         struct vm_area_struct *prev, struct rb_node **rb_link,
 337                         struct rb_node *rb_parent)
 338 {
 339         struct address_space *mapping = NULL;
 340
 341         if (vma->vm_file)
 342                 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
 343
 344         if (mapping)
 345                 down(&mapping->i_shared_sem);
 346         spin_lock(&mm->page_table_lock);
 347         __vma_link(mm, vma, prev, rb_link, rb_parent);
 348         spin_unlock(&mm->page_table_lock);
 349         if (mapping)
 350                 up(&mapping->i_shared_sem);
 351
 352         mark_mm_hugetlb(mm, vma);
 353         mm->map_count++;
 354         validate_mm(mm);
 355 }
 356
 357 /*
 358  * If the vma has a ->close operation then the driver probably needs to release
 359  * per-vma resources, so we don't attempt to merge those.
 360  */
 361 #define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED)
 362
 363 static inline int is_mergeable_vma(struct vm_area_struct *vma,
 364                         struct file *file, unsigned long vm_flags)
 365 {
 366         if (vma->vm_ops && vma->vm_ops->close)
 367                 return 0;
 368         if (vma->vm_file != file)
 369                 return 0;
 370         if (vma->vm_flags != vm_flags)
 371                 return 0;
 372         if (vma->vm_private_data)
 373                 return 0;
 374         return 1;
 375 }
 376
 377 /*
 378  * Return true if we can merge this (vm_flags,file,vm_pgoff,size)
 379  * in front of (at a lower virtual address and file offset than) the vma.
 380  *
 381  * We don't check here for the merged mmap wrapping around the end of pagecache
 382  * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
 383  * wrap, nor mmaps which cover the final page at index -1UL.
 384  */
 385 static int
 386 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
 387         struct file *file, unsigned long vm_pgoff, unsigned long size)
 388 {
 389         if (is_mergeable_vma(vma, file, vm_flags)) {
 390                 if (!file)
 391                         return 1;       /* anon mapping */
 392                 if (vma->vm_pgoff == vm_pgoff + size)
 393                         return 1;
 394         }
 395         return 0;
 396 }
 397
 398 /*
 399  * Return true if we can merge this (vm_flags,file,vm_pgoff)
 400  * beyond (at a higher virtual address and file offset than) the vma.
 401  */
 402 static int
 403 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 404         struct file *file, unsigned long vm_pgoff)
 405 {
 406         if (is_mergeable_vma(vma, file, vm_flags)) {
 407                 unsigned long vma_size;
 408
 409                 if (!file)
 410                         return 1;       /* anon mapping */
 411
 412                 vma_size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 413                 if (vma->vm_pgoff + vma_size == vm_pgoff)
 414                         return 1;
 415         }
 416         return 0;
 417 }
 418
 419 /*
 420  * Given a new mapping request (addr,end,vm_flags,file,pgoff), figure out
 421  * whether that can be merged with its predecessor or its successor.  Or
 422  * both (it neatly fills a hole).
 423  */
 424 static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 425                         struct rb_node *rb_parent, unsigned long addr,
 426                         unsigned long end, unsigned long vm_flags,
 427                         struct file *file, unsigned long pgoff)
 428 {
 429         spinlock_t * lock = &mm->page_table_lock;
 430
 431         /*
 432          * We later require that vma->vm_flags == vm_flags, so this tests
 433          * vma->vm_flags & VM_SPECIAL, too.
 434          */
 435         if (vm_flags & VM_SPECIAL)
 436                 return 0;
 437
 438         if (!prev) {
 439                 prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
 440                 goto merge_next;
 441         }
 442
 443         /*
 444          * Can it merge with the predecessor?
 445          */
 446         if (prev->vm_end == addr &&
 447                         is_mergeable_vma(prev, file, vm_flags) &&
 448                         can_vma_merge_after(prev, vm_flags, file, pgoff)) {
 449                 struct vm_area_struct *next;
 450                 struct inode *inode = file ? file->f_dentry->d_inode : NULL;
 451                 int need_up = 0;
 452
 453                 if (unlikely(file && prev->vm_next &&
 454                                 prev->vm_next->vm_file == file)) {
 455                         down(&inode->i_mapping->i_shared_sem);
 456                         need_up = 1;
 457                 }
 458                 spin_lock(lock);
 459                 prev->vm_end = end;
 460
 461                 /*
 462                  * OK, it did.  Can we now merge in the successor as well?
 463                  */
 464                 next = prev->vm_next;
 465                 if (next && prev->vm_end == next->vm_start &&
 466                                 can_vma_merge_before(next, vm_flags, file,
 467                                         pgoff, (end - addr) >> PAGE_SHIFT)) {
 468                         prev->vm_end = next->vm_end;
 469                         __vma_unlink(mm, next, prev);
 470                         __remove_shared_vm_struct(next, inode);
 471                         spin_unlock(lock);
 472                         if (need_up)
 473                                 up(&inode->i_mapping->i_shared_sem);
 474
 475                         mm->map_count--;
 476                         kmem_cache_free(vm_area_cachep, next);
 477                         return 1;
 478                 }
 479                 spin_unlock(lock);
 480                 if (need_up)
 481                         up(&inode->i_mapping->i_shared_sem);
 482                 return 1;
 483         }
 484
 485         /*
 486          * Can this new request be merged in front of prev->vm_next?
 487          */
 488         prev = prev->vm_next;
 489         if (prev) {
 490  merge_next:
 491                 if (!can_vma_merge_before(prev, vm_flags, file,
 492                                 pgoff, (end - addr) >> PAGE_SHIFT))
 493                         return 0;
 494                 if (end == prev->vm_start) {
 495                         spin_lock(lock);
 496                         prev->vm_start = addr;
 497                         prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT;
 498                         spin_unlock(lock);
 499                         return 1;
 500                 }
 501         }
 502
 503         return 0;
 504 }
 505
 506 /*
 507  * The caller must hold down_write(current->mm->mmap_sem).
 508  */
 509
 510 unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 511                         unsigned long len, unsigned long prot,
 512                         unsigned long flags, unsigned long pgoff)
 513 {
 514         struct mm_struct * mm = current->mm;
 515         struct vm_area_struct * vma, * prev;
 516         struct inode *inode;
 517         unsigned int vm_flags;
 518         int correct_wcount = 0;
 519         int error;
 520         struct rb_node ** rb_link, * rb_parent;
 521         unsigned long charged = 0;
 522
 523         if (file && (!file->f_op || !file->f_op->mmap))
 524                 return -ENODEV;
 525
 526         if (!len)
 527                 return addr;
 528
 529         if (len > TASK_SIZE)
 530                 return -EINVAL;
 531
 532         len = PAGE_ALIGN(len);
 533
 534         /* offset overflow? */
 535         if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
 536                 return -EINVAL;
 537
 538         /* Too many mappings? */
 539         if (mm->map_count > MAX_MAP_COUNT)
 540                 return -ENOMEM;
 541
 542         /* Obtain the address to map to. we verify (or select) it and ensure
 543          * that it represents a valid section of the address space.
 544          */
 545         addr = get_unmapped_area(file, addr, len, pgoff, flags);
 546         if (addr & ~PAGE_MASK)
 547                 return addr;
 548
 549         /* Do simple checking here so the lower-level routines won't have
 550          * to. we assume access permissions have been handled by the open
 551          * of the memory object, so we don't do any here.
 552          */
 553         vm_flags = calc_vm_flags(prot,flags) | mm->def_flags |
 554                         VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 555
 556         if (flags & MAP_LOCKED) {
 557                 if (!capable(CAP_IPC_LOCK))
 558                         return -EPERM;
 559                 vm_flags |= VM_LOCKED;
 560         }
 561         /* mlock MCL_FUTURE? */
 562         if (vm_flags & VM_LOCKED) {
 563                 unsigned long locked = mm->locked_vm << PAGE_SHIFT;
 564                 locked += len;
 565                 if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
 566                         return -EAGAIN;
 567         }
 568
 569         inode = file ? file->f_dentry->d_inode : NULL;
 570
 571         if (file) {
 572                 switch (flags & MAP_TYPE) {
 573                 case MAP_SHARED:
 574                         if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
 575                                 return -EACCES;
 576
 577                         /*
 578                          * Make sure we don't allow writing to an append-only
 579                          * file..
 580                          */
 581                         if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
 582                                 return -EACCES;
 583
 584                         /*
 585                          * Make sure there are no mandatory locks on the file.
 586                          */
 587                         if (locks_verify_locked(inode))
 588                                 return -EAGAIN;
 589
 590                         vm_flags |= VM_SHARED | VM_MAYSHARE;
 591                         if (!(file->f_mode & FMODE_WRITE))
 592                                 vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
 593
 594                         /* fall through */
 595                 case MAP_PRIVATE:
 596                         if (!(file->f_mode & FMODE_READ))
 597                                 return -EACCES;
 598                         break;
 599
 600                 default:
 601                         return -EINVAL;
 602                 }
 603         } else {
 604                 vm_flags |= VM_SHARED | VM_MAYSHARE;
 605                 switch (flags & MAP_TYPE) {
 606                 default:
 607                         return -EINVAL;
 608                 case MAP_PRIVATE:
 609                         vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
 610                         /* fall through */
 611                 case MAP_SHARED:
 612                         break;
 613                 }
 614         }
 615
 616         error = security_file_mmap(file, prot, flags);
 617         if (error)
 618                 return error;
 619
 620         /* Clear old maps */
 621         error = -ENOMEM;
 622 munmap_back:
 623         vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
 624         if (vma && vma->vm_start < addr + len) {
 625                 if (do_munmap(mm, addr, len))
 626                         return -ENOMEM;
 627                 goto munmap_back;
 628         }
 629
 630         /* Check against address space limit. */
 631         if ((mm->total_vm << PAGE_SHIFT) + len
 632             > current->rlim[RLIMIT_AS].rlim_cur)
 633                 return -ENOMEM;
 634
 635         if (!(flags & MAP_NORESERVE) || sysctl_overcommit_memory > 1) {
 636                 if (vm_flags & VM_SHARED) {
 637                         /* Check memory availability in shmem_file_setup? */
 638                         vm_flags |= VM_ACCOUNT;
 639                 } else if (vm_flags & VM_WRITE) {
 640                         /*
 641                          * Private writable mapping: check memory availability
 642                          */
 643                         charged = len >> PAGE_SHIFT;
 644                         if (!vm_enough_memory(charged))
 645                                 return -ENOMEM;
 646                         vm_flags |= VM_ACCOUNT;
 647                 }
 648         }
 649
 650         /* Can we just expand an old anonymous mapping? */
 651         if (!file && !(vm_flags & VM_SHARED) && rb_parent)
 652                 if (vma_merge(mm, prev, rb_parent, addr, addr + len,
 653                                         vm_flags, NULL, 0))
 654                         goto out;
 655
 656         /*
 657          * Determine the object being mapped and call the appropriate
 658          * specific mapper. the address has already been validated, but
 659          * not unmapped, but the maps are removed from the list.
 660          */
 661         vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 662         error = -ENOMEM;
 663         if (!vma)
 664                 goto unacct_error;
 665
 666         vma->vm_mm = mm;
 667         vma->vm_start = addr;
 668         vma->vm_end = addr + len;
 669         vma->vm_flags = vm_flags;
 670         vma->vm_page_prot = protection_map[vm_flags & 0x0f];
 671         vma->vm_ops = NULL;
 672         vma->vm_pgoff = pgoff;
 673         vma->vm_file = NULL;
 674         vma->vm_private_data = NULL;
 675         INIT_LIST_HEAD(&vma->shared);
 676
 677         if (file) {
 678                 error = -EINVAL;
 679                 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
 680                         goto free_vma;
 681                 if (vm_flags & VM_DENYWRITE) {
 682                         error = deny_write_access(file);
 683                         if (error)
 684                                 goto free_vma;
 685                         correct_wcount = 1;
 686                 }
 687                 vma->vm_file = file;
 688                 get_file(file);
 689                 error = file->f_op->mmap(file, vma);
 690                 if (error)
 691                         goto unmap_and_free_vma;
 692         } else if (vm_flags & VM_SHARED) {
 693                 error = shmem_zero_setup(vma);
 694                 if (error)
 695                         goto free_vma;
 696         }
 697
 698         /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
 699          * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
 700          * that memory reservation must be checked; but that reservation
 701          * belongs to shared memory object, not to vma: so now clear it.
 702          */
 703         if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
 704                 vma->vm_flags &= ~VM_ACCOUNT;
 705
 706         /* Can addr have changed??
 707          *
 708          * Answer: Yes, several device drivers can do it in their
 709          *         f_op->mmap method. -DaveM
 710          */
 711         addr = vma->vm_start;
 712
 713         if (!file || !rb_parent || !vma_merge(mm, prev, rb_parent, addr,
 714                                 addr + len, vma->vm_flags, file, pgoff)) {
 715                 vma_link(mm, vma, prev, rb_link, rb_parent);
 716                 if (correct_wcount)
 717                         atomic_inc(&inode->i_writecount);
 718         } else {
 719                 if (file) {
 720                         if (correct_wcount)
 721                                 atomic_inc(&inode->i_writecount);
 722                         fput(file);
 723                 }
 724                 kmem_cache_free(vm_area_cachep, vma);
 725         }
 726 out:
 727         mm->total_vm += len >> PAGE_SHIFT;
 728         if (vm_flags & VM_LOCKED) {
 729                 mm->locked_vm += len >> PAGE_SHIFT;
 730                 make_pages_present(addr, addr + len);
 731         }
 732         if (flags & MAP_POPULATE) {
 733                 up_write(&mm->mmap_sem);
 734                 sys_remap_file_pages(addr, len, prot,
 735                                         pgoff, flags & MAP_NONBLOCK);
 736                 down_write(&mm->mmap_sem);
 737         }
 738         return addr;
 739
 740 unmap_and_free_vma:
 741         if (correct_wcount)
 742                 atomic_inc(&inode->i_writecount);
 743         vma->vm_file = NULL;
 744         fput(file);
 745
 746         /* Undo any partial mapping done by a device driver. */
 747         zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
 748 free_vma:
 749         kmem_cache_free(vm_area_cachep, vma);
 750 unacct_error:
 751         if (charged)
 752                 vm_unacct_memory(charged);
 753         return error;
 754 }
 755
 756 /* Get an address range which is currently unmapped.
 757  * For shmat() with addr=0.
 758  *
 759  * Ugly calling convention alert:
 760  * Return value with the low bits set means error value,
 761  * ie
 762  *      if (ret & ~PAGE_MASK)
 763  *              error = ret;
 764  *
 765  * This function "knows" that -ENOMEM has the bits set.
 766  */
 767 #ifndef HAVE_ARCH_UNMAPPED_AREA
 768 static inline unsigned long
 769 arch_get_unmapped_area(struct file *filp, unsigned long addr,
 770                 unsigned long len, unsigned long pgoff, unsigned long flags)
 771 {
 772         struct mm_struct *mm = current->mm;
 773         struct vm_area_struct *vma;
 774         int found_hole = 0;
 775
 776         if (len > TASK_SIZE)
 777                 return -ENOMEM;
 778
 779         if (addr) {
 780                 addr = PAGE_ALIGN(addr);
 781                 vma = find_vma(mm, addr);
 782                 if (TASK_SIZE - len >= addr &&
 783                     (!vma || addr + len <= vma->vm_start))
 784                         return addr;
 785         }
 786         addr = mm->free_area_cache;
 787
 788         for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
 789                 /* At this point:  (!vma || addr < vma->vm_end). */
 790                 if (TASK_SIZE - len < addr)
 791                         return -ENOMEM;
 792                 /*
 793                  * Record the first available hole.
 794                  */
 795                 if (!found_hole && (!vma || addr < vma->vm_start)) {
 796                         mm->free_area_cache = addr;
 797                         found_hole = 1;
 798                 }
 799                 if (!vma || addr + len <= vma->vm_start)
 800                         return addr;
 801                 addr = vma->vm_end;
 802         }
 803 }
 804 #else
 805 extern unsigned long
 806 arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
 807                         unsigned long, unsigned long);
 808 #endif
 809
 810 unsigned long
 811 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 812                 unsigned long pgoff, unsigned long flags)
 813 {
 814         if (flags & MAP_FIXED) {
 815                 unsigned long ret;
 816
 817                 if (addr > TASK_SIZE - len)
 818                         return -ENOMEM;
 819                 if (addr & ~PAGE_MASK)
 820                         return -EINVAL;
 821                 if (file && is_file_hugepages(file))  {
 822                         /*
 823                          * Make sure that addr and length are properly aligned.
 824                          */
 825                         ret = is_aligned_hugepage_range(addr, len);
 826                 } else {
 827                         /*
 828                          * Ensure that a normal request is not falling in a
 829                          * reserved hugepage range.  For some archs like IA-64,
 830                          * there is a separate region for hugepages.
 831                          */
 832                         ret = check_valid_hugepage_range(addr, len);
 833                 }
 834                 if (ret)
 835                         return ret;
 836                 return addr;
 837         }
 838
 839         if (file && file->f_op && file->f_op->get_unmapped_area)
 840                 return file->f_op->get_unmapped_area(file, addr, len,
 841                                                 pgoff, flags);
 842
 843         return arch_get_unmapped_area(file, addr, len, pgoff, flags);
 844 }
 845
 846 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 847 struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
 848 {
 849         struct vm_area_struct *vma = NULL;
 850
 851         if (mm) {
 852                 /* Check the cache first. */
 853                 /* (Cache hit rate is typically around 35%.) */
 854                 vma = mm->mmap_cache;
 855                 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
 856                         struct rb_node * rb_node;
 857
 858                         rb_node = mm->mm_rb.rb_node;
 859                         vma = NULL;
 860
 861                         while (rb_node) {
 862                                 struct vm_area_struct * vma_tmp;
 863
 864                                 vma_tmp = rb_entry(rb_node,
 865                                                 struct vm_area_struct, vm_rb);
 866
 867                                 if (vma_tmp->vm_end > addr) {
 868                                         vma = vma_tmp;
 869                                         if (vma_tmp->vm_start <= addr)
 870                                                 break;
 871                                         rb_node = rb_node->rb_left;
 872                                 } else
 873                                         rb_node = rb_node->rb_right;
 874                         }
 875                         if (vma)
 876                                 mm->mmap_cache = vma;
 877                 }
 878         }
 879         return vma;
 880 }
 881
 882 /* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
 883 struct vm_area_struct *
 884 find_vma_prev(struct mm_struct *mm, unsigned long addr,
 885                         struct vm_area_struct **pprev)
 886 {
 887         struct vm_area_struct *vma = NULL, *prev = NULL;
 888         struct rb_node * rb_node;
 889         if (!mm)
 890                 goto out;
 891
 892         /* Guard against addr being lower than the first VMA */
 893         vma = mm->mmap;
 894
 895         /* Go through the RB tree quickly. */
 896         rb_node = mm->mm_rb.rb_node;
 897
 898         while (rb_node) {
 899                 struct vm_area_struct *vma_tmp;
 900                 vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
 901
 902                 if (addr < vma_tmp->vm_end) {
 903                         rb_node = rb_node->rb_left;
 904                 } else {
 905                         prev = vma_tmp;
 906                         if (!prev->vm_next || (addr < prev->vm_next->vm_end))
 907                                 break;
 908                         rb_node = rb_node->rb_right;
 909                 }
 910         }
 911
 912 out:
 913         *pprev = prev;
 914         return prev ? prev->vm_next : vma;
 915 }
 916
 917 #ifdef CONFIG_STACK_GROWSUP
 918 /*
 919  * vma is the first one with address > vma->vm_end.  Have to extend vma.
 920  */
 921 int expand_stack(struct vm_area_struct * vma, unsigned long address)
 922 {
 923         unsigned long grow;
 924
 925         if (!(vma->vm_flags & VM_GROWSUP))
 926                 return -EFAULT;
 927
 928         /*
 929          * vma->vm_start/vm_end cannot change under us because the caller
 930          * is required to hold the mmap_sem in read mode. We need to get
 931          * the spinlock only before relocating the vma range ourself.
 932          */
 933         address += 4 + PAGE_SIZE - 1;
 934         address &= PAGE_MASK;
 935         spin_lock(&vma->vm_mm->page_table_lock);
 936         grow = (address - vma->vm_end) >> PAGE_SHIFT;
 937
 938         /* Overcommit.. */
 939         if (!vm_enough_memory(grow)) {
 940                 spin_unlock(&vma->vm_mm->page_table_lock);
 941                 return -ENOMEM;
 942         }
 943
 944         if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur ||
 945                         ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
 946                         current->rlim[RLIMIT_AS].rlim_cur) {
 947                 spin_unlock(&vma->vm_mm->page_table_lock);
 948                 vm_unacct_memory(grow);
 949                 return -ENOMEM;
 950         }
 951         vma->vm_end = address;
 952         vma->vm_mm->total_vm += grow;
 953         if (vma->vm_flags & VM_LOCKED)
 954                 vma->vm_mm->locked_vm += grow;
 955         spin_unlock(&vma->vm_mm->page_table_lock);
 956         return 0;
 957 }
 958
 959 struct vm_area_struct *
 960 find_extend_vma(struct mm_struct *mm, unsigned long addr)
 961 {
 962         struct vm_area_struct *vma, *prev;
 963
 964         addr &= PAGE_MASK;
 965         vma = find_vma_prev(mm, addr, &prev);
 966         if (vma && (vma->vm_start <= addr))
 967                 return vma;
 968         if (!prev || expand_stack(prev, addr))
 969                 return NULL;
 970         if (prev->vm_flags & VM_LOCKED) {
 971                 make_pages_present(addr, prev->vm_end);
 972         }
 973         return prev;
 974 }
 975 #else
 976 /*
 977  * vma is the first one with address < vma->vm_start.  Have to extend vma.
 978  */
 979 int expand_stack(struct vm_area_struct *vma, unsigned long address)
 980 {
 981         unsigned long grow;
 982
 983         /*
 984          * vma->vm_start/vm_end cannot change under us because the caller
 985          * is required to hold the mmap_sem in read mode. We need to get
 986          * the spinlock only before relocating the vma range ourself.
 987          */
 988         address &= PAGE_MASK;
 989         spin_lock(&vma->vm_mm->page_table_lock);
 990         grow = (vma->vm_start - address) >> PAGE_SHIFT;
 991
 992         /* Overcommit.. */
 993         if (!vm_enough_memory(grow)) {
 994                 spin_unlock(&vma->vm_mm->page_table_lock);
 995                 return -ENOMEM;
 996         }
 997
 998         if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur ||
 999                         ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) >
1000                         current->rlim[RLIMIT_AS].rlim_cur) {
1001                 spin_unlock(&vma->vm_mm->page_table_lock);
1002                 vm_unacct_memory(grow);
1003                 return -ENOMEM;
1004         }
1005         vma->vm_start = address;
1006         vma->vm_pgoff -= grow;
1007         vma->vm_mm->total_vm += grow;
1008         if (vma->vm_flags & VM_LOCKED)
1009                 vma->vm_mm->locked_vm += grow;
1010         spin_unlock(&vma->vm_mm->page_table_lock);
1011         return 0;
1012 }
1013
1014 struct vm_area_struct *
1015 find_extend_vma(struct mm_struct * mm, unsigned long addr)
1016 {
1017         struct vm_area_struct * vma;
1018         unsigned long start;
1019
1020         addr &= PAGE_MASK;
1021         vma = find_vma(mm,addr);
1022         if (!vma)
1023                 return NULL;
1024         if (vma->vm_start <= addr)
1025                 return vma;
1026         if (!(vma->vm_flags & VM_GROWSDOWN))
1027                 return NULL;
1028         start = vma->vm_start;
1029         if (expand_stack(vma, addr))
1030                 return NULL;
1031         if (vma->vm_flags & VM_LOCKED) {
1032                 make_pages_present(addr, start);
1033         }
1034         return vma;
1035 }
1036 #endif
1037
1038 /*
1039  * Try to free as many page directory entries as we can,
1040  * without having to work very hard at actually scanning
1041  * the page tables themselves.
1042  *
1043  * Right now we try to free page tables if we have a nice
1044  * PGDIR-aligned area that got free'd up. We could be more
1045  * granular if we want to, but this is fast and simple,
1046  * and covers the bad cases.
1047  *
1048  * "prev", if it exists, points to a vma before the one
1049  * we just free'd - but there's no telling how much before.
1050  */
1051 static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
1052         unsigned long start, unsigned long end)
1053 {
1054         unsigned long first = start & PGDIR_MASK;
1055         unsigned long last = end + PGDIR_SIZE - 1;
1056         unsigned long start_index, end_index;
1057         struct mm_struct *mm = tlb->mm;
1058
1059         if (!prev) {
1060                 prev = mm->mmap;
1061                 if (!prev)
1062                         goto no_mmaps;
1063                 if (prev->vm_end > start) {
1064                         if (last > prev->vm_start)
1065                                 last = prev->vm_start;
1066                         goto no_mmaps;
1067                 }
1068         }
1069         for (;;) {
1070                 struct vm_area_struct *next = prev->vm_next;
1071
1072                 if (next) {
1073                         if (next->vm_start < start) {
1074                                 prev = next;
1075                                 continue;
1076                         }
1077                         if (last > next->vm_start)
1078                                 last = next->vm_start;
1079                 }
1080                 if (prev->vm_end > first)
1081                         first = prev->vm_end + PGDIR_SIZE - 1;
1082                 break;
1083         }
1084 no_mmaps:
1085         if (last < first)       /* for arches with discontiguous pgd indices */
1086                 return;
1087         /*
1088          * If the PGD bits are not consecutive in the virtual address, the
1089          * old method of shifting the VA >> by PGDIR_SHIFT doesn't work.
1090          */
1091         start_index = pgd_index(first);
1092         if (start_index < FIRST_USER_PGD_NR)
1093                 start_index = FIRST_USER_PGD_NR;
1094         end_index = pgd_index(last);
1095         if (end_index > start_index) {
1096                 clear_page_tables(tlb, start_index, end_index - start_index);
1097                 flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK);
1098         }
1099 }
1100
1101 /* Normal function to fix up a mapping
1102  * This function is the default for when an area has no specific
1103  * function.  This may be used as part of a more specific routine.
1104  *
1105  * By the time this function is called, the area struct has been
1106  * removed from the process mapping list.
1107  */
1108 static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
1109 {
1110         size_t len = area->vm_end - area->vm_start;
1111
1112         area->vm_mm->total_vm -= len >> PAGE_SHIFT;
1113         if (area->vm_flags & VM_LOCKED)
1114                 area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
1115         /*
1116          * Is this a new hole at the lowest possible address?
1117          */
1118         if (area->vm_start >= TASK_UNMAPPED_BASE &&
1119                                 area->vm_start < area->vm_mm->free_area_cache)
1120               area->vm_mm->free_area_cache = area->vm_start;
1121
1122         remove_shared_vm_struct(area);
1123
1124         if (area->vm_ops && area->vm_ops->close)
1125                 area->vm_ops->close(area);
1126         if (area->vm_file)
1127                 fput(area->vm_file);
1128         kmem_cache_free(vm_area_cachep, area);
1129 }
1130
1131 /*
1132  * Update the VMA and inode share lists.
1133  *
1134  * Ok - we have the memory areas we should free on the 'free' list,
1135  * so release them, and do the vma updates.
1136  */
1137 static void unmap_vma_list(struct mm_struct *mm,
1138         struct vm_area_struct *mpnt)
1139 {
1140         do {
1141                 struct vm_area_struct *next = mpnt->vm_next;
1142                 unmap_vma(mm, mpnt);
1143                 mpnt = next;
1144         } while (mpnt != NULL);
1145         validate_mm(mm);
1146 }
1147
1148 /*
1149  * Get rid of page table information in the indicated region.
1150  *
1151  * Called with the page table lock held.
1152  */
1153 static void unmap_region(struct mm_struct *mm,
1154         struct vm_area_struct *vma,
1155         struct vm_area_struct *prev,
1156         unsigned long start,
1157         unsigned long end)
1158 {
1159         struct mmu_gather *tlb;
1160         unsigned long nr_accounted = 0;
1161
1162         lru_add_drain();
1163         tlb = tlb_gather_mmu(mm, 0);
1164         unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted);
1165         vm_unacct_memory(nr_accounted);
1166         free_pgtables(tlb, prev, start, end);
1167         tlb_finish_mmu(tlb, start, end);
1168 }
1169
1170 /*
1171  * Create a list of vma's touched by the unmap, removing them from the mm's
1172  * vma list as we go..
1173  *
1174  * Called with the page_table_lock held.
1175  */
1176 static void
1177 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1178         struct vm_area_struct *prev, unsigned long end)
1179 {
1180         struct vm_area_struct **insertion_point;
1181         struct vm_area_struct *tail_vma = NULL;
1182
1183         insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1184         do {
1185                 rb_erase(&vma->vm_rb, &mm->mm_rb);
1186                 mm->map_count--;
1187                 tail_vma = vma;
1188                 vma = vma->vm_next;
1189         } while (vma && vma->vm_start < end);
1190         *insertion_point = vma;
1191         tail_vma->vm_next = NULL;
1192         mm->mmap_cache = NULL;          /* Kill the cache. */
1193 }
1194
1195 /*
1196  * Split a vma into two pieces at address 'addr', a new vma is allocated
1197  * either for the first part or the the tail.
1198  */
1199 int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1200               unsigned long addr, int new_below)
1201 {
1202         struct vm_area_struct *new;
1203
1204         if (mm->map_count >= MAX_MAP_COUNT)
1205                 return -ENOMEM;
1206
1207         new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1208         if (!new)
1209                 return -ENOMEM;
1210
1211         /* most fields are the same, copy all, and then fixup */
1212         *new = *vma;
1213
1214         INIT_LIST_HEAD(&new->shared);
1215
1216         if (new_below) {
1217                 new->vm_end = addr;
1218                 vma->vm_start = addr;
1219                 vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT);
1220         } else {
1221                 vma->vm_end = addr;
1222                 new->vm_start = addr;
1223                 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
1224         }
1225
1226         if (new->vm_file)
1227                 get_file(new->vm_file);
1228
1229         if (new->vm_ops && new->vm_ops->open)
1230                 new->vm_ops->open(new);
1231
1232         insert_vm_struct(mm, new);
1233         return 0;
1234 }
1235
1236 /* Munmap is split into 2 main parts -- this part which finds
1237  * what needs doing, and the areas themselves, which do the
1238  * work.  This now handles partial unmappings.
1239  * Jeremy Fitzhardinge <jeremy@goop.org>
1240  */
1241 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1242 {
1243         unsigned long end;
1244         struct vm_area_struct *mpnt, *prev, *last;
1245
1246         if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
1247                 return -EINVAL;
1248
1249         if ((len = PAGE_ALIGN(len)) == 0)
1250                 return -EINVAL;
1251
1252         /* Find the first overlapping VMA */
1253         mpnt = find_vma_prev(mm, start, &prev);
1254         if (!mpnt)
1255                 return 0;
1256         /* we have  start < mpnt->vm_end  */
1257
1258         if (is_vm_hugetlb_page(mpnt)) {
1259                 int ret = is_aligned_hugepage_range(start, len);
1260
1261                 if (ret)
1262                         return ret;
1263         }
1264
1265         /* if it doesn't overlap, we have nothing.. */
1266         end = start + len;
1267         if (mpnt->vm_start >= end)
1268                 return 0;
1269
1270         /* Something will probably happen, so notify. */
1271         if (mpnt->vm_file && (mpnt->vm_flags & VM_EXEC))
1272                 profile_exec_unmap(mm);
1273
1274         /*
1275          * If we need to split any vma, do it now to save pain later.
1276          *
1277          * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
1278          * unmapped vm_area_struct will remain in use: so lower split_vma
1279          * places tmp vma above, and higher split_vma places tmp vma below.
1280          */
1281         if (start > mpnt->vm_start) {
1282                 if (split_vma(mm, mpnt, start, 0))
1283                         return -ENOMEM;
1284                 prev = mpnt;
1285         }
1286
1287         /* Does it split the last one? */
1288         last = find_vma(mm, end);
1289         if (last && end > last->vm_start) {
1290                 if (split_vma(mm, last, end, 1))
1291                         return -ENOMEM;
1292         }
1293         mpnt = prev? prev->vm_next: mm->mmap;
1294
1295         /*
1296          * Remove the vma's, and unmap the actual pages
1297          */
1298         spin_lock(&mm->page_table_lock);
1299         detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
1300         unmap_region(mm, mpnt, prev, start, end);
1301         spin_unlock(&mm->page_table_lock);
1302
1303         /* Fix up all other VM information */
1304         unmap_vma_list(mm, mpnt);
1305
1306         return 0;
1307 }
1308
1309 asmlinkage long sys_munmap(unsigned long addr, size_t len)
1310 {
1311         int ret;
1312         struct mm_struct *mm = current->mm;
1313
1314         down_write(&mm->mmap_sem);
1315         ret = do_munmap(mm, addr, len);
1316         up_write(&mm->mmap_sem);
1317         return ret;
1318 }
1319
1320 /*
1321  *  this is really a simplified "do_mmap".  it only handles
1322  *  anonymous maps.  eventually we may be able to do some
1323  *  brk-specific accounting here.
1324  */
1325 unsigned long do_brk(unsigned long addr, unsigned long len)
1326 {
1327         struct mm_struct * mm = current->mm;
1328         struct vm_area_struct * vma, * prev;
1329         unsigned long flags;
1330         struct rb_node ** rb_link, * rb_parent;
1331
1332         len = PAGE_ALIGN(len);
1333         if (!len)
1334                 return addr;
1335
1336         /*
1337          * mlock MCL_FUTURE?
1338          */
1339         if (mm->def_flags & VM_LOCKED) {
1340                 unsigned long locked = mm->locked_vm << PAGE_SHIFT;
1341                 locked += len;
1342                 if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
1343                         return -EAGAIN;
1344         }
1345
1346         /*
1347          * Clear old maps.  this also does some error checking for us
1348          */
1349  munmap_back:
1350         vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
1351         if (vma && vma->vm_start < addr + len) {
1352                 if (do_munmap(mm, addr, len))
1353                         return -ENOMEM;
1354                 goto munmap_back;
1355         }
1356
1357         /* Check against address space limits *after* clearing old maps... */
1358         if ((mm->total_vm << PAGE_SHIFT) + len
1359             > current->rlim[RLIMIT_AS].rlim_cur)
1360                 return -ENOMEM;
1361
1362         if (mm->map_count > MAX_MAP_COUNT)
1363                 return -ENOMEM;
1364
1365         if (!vm_enough_memory(len >> PAGE_SHIFT))
1366                 return -ENOMEM;
1367
1368         flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
1369
1370         /* Can we just expand an old anonymous mapping? */
1371         if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len,
1372                                         flags, NULL, 0))
1373                 goto out;
1374
1375         /*
1376          * create a vma struct for an anonymous mapping
1377          */
1378         vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
1379         if (!vma) {
1380                 vm_unacct_memory(len >> PAGE_SHIFT);
1381                 return -ENOMEM;
1382         }
1383
1384         vma->vm_mm = mm;
1385         vma->vm_start = addr;
1386         vma->vm_end = addr + len;
1387         vma->vm_flags = flags;
1388         vma->vm_page_prot = protection_map[flags & 0x0f];
1389         vma->vm_ops = NULL;
1390         vma->vm_pgoff = 0;
1391         vma->vm_file = NULL;
1392         vma->vm_private_data = NULL;
1393         INIT_LIST_HEAD(&vma->shared);
1394
1395         vma_link(mm, vma, prev, rb_link, rb_parent);
1396
1397 out:
1398         mm->total_vm += len >> PAGE_SHIFT;
1399         if (flags & VM_LOCKED) {
1400                 mm->locked_vm += len >> PAGE_SHIFT;
1401                 make_pages_present(addr, addr + len);
1402         }
1403         return addr;
1404 }
1405
1406 /* Build the RB tree corresponding to the VMA list. */
1407 void build_mmap_rb(struct mm_struct * mm)
1408 {
1409         struct vm_area_struct * vma;
1410         struct rb_node ** rb_link, * rb_parent;
1411
1412         mm->mm_rb = RB_ROOT;
1413         rb_link = &mm->mm_rb.rb_node;
1414         rb_parent = NULL;
1415         for (vma = mm->mmap; vma; vma = vma->vm_next) {
1416                 __vma_link_rb(mm, vma, rb_link, rb_parent);
1417                 rb_parent = &vma->vm_rb;
1418                 rb_link = &rb_parent->rb_right;
1419         }
1420 }
1421
1422 /* Release all mmaps. */
1423 void exit_mmap(struct mm_struct *mm)
1424 {
1425         struct mmu_gather *tlb;
1426         struct vm_area_struct *vma;
1427         unsigned long nr_accounted = 0;
1428
1429         profile_exit_mmap(mm);
1430
1431         lru_add_drain();
1432
1433         spin_lock(&mm->page_table_lock);
1434
1435         tlb = tlb_gather_mmu(mm, 1);
1436         flush_cache_mm(mm);
1437         /* Use ~0UL here to ensure all VMAs in the mm are unmapped */
1438         mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
1439                                         ~0UL, &nr_accounted);
1440         vm_unacct_memory(nr_accounted);
1441         BUG_ON(mm->map_count);  /* This is just debugging */
1442         clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
1443         tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
1444
1445         vma = mm->mmap;
1446         mm->mmap = mm->mmap_cache = NULL;
1447         mm->mm_rb = RB_ROOT;
1448         mm->rss = 0;
1449         mm->total_vm = 0;
1450         mm->locked_vm = 0;
1451
1452         spin_unlock(&mm->page_table_lock);
1453
1454         /*
1455          * Walk the list again, actually closing and freeing it
1456          * without holding any MM locks.
1457          */
1458         while (vma) {
1459                 struct vm_area_struct *next = vma->vm_next;
1460                 remove_shared_vm_struct(vma);
1461                 if (vma->vm_ops) {
1462                         if (vma->vm_ops->close)
1463                                 vma->vm_ops->close(vma);
1464                 }
1465                 if (vma->vm_file)
1466                         fput(vma->vm_file);
1467                 kmem_cache_free(vm_area_cachep, vma);
1468                 vma = next;
1469         }
1470 }
1471
1472 /* Insert vm structure into process list sorted by address
1473  * and into the inode's i_mmap ring.  If vm_file is non-NULL
1474  * then i_shared_sem is taken here.
1475  */
1476 void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
1477 {
1478         struct vm_area_struct * __vma, * prev;
1479         struct rb_node ** rb_link, * rb_parent;
1480
1481         __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
1482         if (__vma && __vma->vm_start < vma->vm_end)
1483                 BUG();
1484         vma_link(mm, vma, prev, rb_link, rb_parent);
1485         validate_mm(mm);
1486 }