Update to 3.4-final.

[linux-flexiantxendom0-3.2.10.git] / fs / namei.c
diff --git a/fs/namei.c b/fs/namei.c

index 2826db3..d1ebe96 100644 (file)
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -15,7 +15,7 @@
   */
  
  #include <linux/init.h>
-#include <linux/module.h>
+#include <linux/export.h>
  #include <linux/slab.h>
  #include <linux/fs.h>
  #include <linux/namei.h>
@@ -36,6 +36,7 @@
  #include <asm/uaccess.h>
  
  #include "internal.h"
+#include "mount.h"
  
  /* [Feb-1997 T. Schoebel-Theuer]
   * Fundamental changes in the pathname lookup mechanisms (namei)
@@ -137,21 +138,21 @@ static int do_getname(const char __user *filename, char *page)
         return retval;
  }
  
-static char *getname_flags(const char __user * filename, int flags)
+static char *getname_flags(const char __user *filename, int flags, int *empty)
  {
-       char *tmp, *result;
+       char *result = __getname();
+       int retval;
  
-       result = ERR_PTR(-ENOMEM);
-       tmp = __getname();
-       if (tmp)  {
-               int retval = do_getname(filename, tmp);
+       if (!result)
+               return ERR_PTR(-ENOMEM);
  
-               result = tmp;
-               if (retval < 0) {
-                       if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
-                               __putname(tmp);
-                               result = ERR_PTR(retval);
-                       }
+       retval = do_getname(filename, result);
+       if (retval < 0) {
+               if (retval == -ENOENT && empty)
+                       *empty = 1;
+               if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+                       __putname(result);
+                       return ERR_PTR(retval);
                 }
         }
         audit_getname(result);
@@ -160,7 +161,7 @@ static char *getname_flags(const char __user * filename, int flags)
  
  char *getname(const char __user * filename)
  {
-       return getname_flags(filename, 0);
+       return getname_flags(filename, 0, NULL);
  }
  
  #ifdef CONFIG_AUDITSYSCALL
@@ -221,14 +222,12 @@ static int check_acl(struct inode *inode, int mask)
  }
  
  /*
- * This does basic POSIX ACL permission checking
+ * This does the basic permission checking
   */
  static int acl_permission_check(struct inode *inode, int mask)
  {
         unsigned int mode = inode->i_mode;
  
-       mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
-
         if (current_user_ns() != inode_userns(inode))
                 goto other_perms;
  
@@ -257,7 +256,7 @@ other_perms:
  /**
   * generic_permission -  check for access rights on a Posix-like filesystem
   * @inode:     inode to check access rights for
- * @mask:      right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @mask:      right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
   *
   * Used to check for read/write/execute permissions on a file.
   * We use "fsuid" for this, letting us set arbitrary permissions
@@ -273,7 +272,7 @@ int generic_permission(struct inode *inode, int mask)
         int ret;
  
         /*
-        * Do the basic POSIX ACL permission checks.
+        * Do the basic permission checks.
          */
         ret = acl_permission_check(inode, mask);
         if (ret != -EACCES)
@@ -331,12 +330,14 @@ static inline int do_inode_permission(struct inode *inode, int mask)
  /**
   * inode_permission  -  check for access rights to a given inode
   * @inode:     inode to check permission on
- * @mask:      right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ * @mask:      right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
   *
   * Used to check for read/write/execute permissions on an inode.
   * We use "fsuid" for this, letting us set arbitrary permissions
   * for filesystem access without changing the "normal" uids which
   * are used for other things.
+ *
+ * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
   */
  int inode_permission(struct inode *inode, int mask)
  {
@@ -641,7 +642,7 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
         cond_resched();
         current->total_link_count++;
  
-       touch_atime(link->mnt, dentry);
+       touch_atime(link);
         nd_set_link(nd, NULL);
  
         error = security_inode_follow_link(link->dentry, nd);
@@ -674,36 +675,38 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
  
  static int follow_up_rcu(struct path *path)
  {
-       struct vfsmount *parent;
+       struct mount *mnt = real_mount(path->mnt);
+       struct mount *parent;
         struct dentry *mountpoint;
  
-       parent = path->mnt->mnt_parent;
-       if (parent == path->mnt)
+       parent = mnt->mnt_parent;
+       if (&parent->mnt == path->mnt)
                 return 0;
-       mountpoint = path->mnt->mnt_mountpoint;
+       mountpoint = mnt->mnt_mountpoint;
         path->dentry = mountpoint;
-       path->mnt = parent;
+       path->mnt = &parent->mnt;
         return 1;
  }
  
  int follow_up(struct path *path)
  {
-       struct vfsmount *parent;
+       struct mount *mnt = real_mount(path->mnt);
+       struct mount *parent;
         struct dentry *mountpoint;
  
         br_read_lock(vfsmount_lock);
-       parent = path->mnt->mnt_parent;
-       if (parent == path->mnt) {
+       parent = mnt->mnt_parent;
+       if (&parent->mnt == path->mnt) {
                 br_read_unlock(vfsmount_lock);
                 return 0;
         }
-       mntget(parent);
-       mountpoint = dget(path->mnt->mnt_mountpoint);
+       mntget(&parent->mnt);
+       mountpoint = dget(mnt->mnt_mountpoint);
         br_read_unlock(vfsmount_lock);
         dput(path->dentry);
         path->dentry = mountpoint;
         mntput(path->mnt);
-       path->mnt = parent;
+       path->mnt = &parent->mnt;
         return 1;
  }
  
@@ -721,31 +724,22 @@ static int follow_automount(struct path *path, unsigned flags,
         if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
                 return -EREMOTE;
  
-       /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
-        * and this is the terminal part of the path.
+       /* We don't want to mount if someone's just doing a stat -
+        * unless they're stat'ing a directory and appended a '/' to
+        * the name.
+        *
+        * We do, however, want to mount if someone wants to open or
+        * create a file of any type under the mountpoint, wants to
+        * traverse through the mountpoint or wants to open the
+        * mounted directory.  Also, autofs may mark negative dentries
+        * as being automount points.  These will need the attentions
+        * of the daemon to instantiate them before they can be used.
          */
-       if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT))
-               return -EISDIR; /* we actually want to stop here */
+       if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
+                    LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
+           path->dentry->d_inode)
+               return -EISDIR;
  
-       /*
-        * We don't want to mount if someone's just doing a stat and they've
-        * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
-        * appended a '/' to the name.
-        */
-       if (!(flags & LOOKUP_FOLLOW)) {
-               /* We do, however, want to mount if someone wants to open or
-                * create a file of any type under the mountpoint, wants to
-                * traverse through the mountpoint or wants to open the mounted
-                * directory.
-                * Also, autofs may mark negative dentries as being automount
-                * points.  These will need the attentions of the daemon to
-                * instantiate them before they can be used.
-                */
-               if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
-                            LOOKUP_OPEN | LOOKUP_CREATE)) &&
-                   path->dentry->d_inode)
-                       return -EISDIR;
-       }
         current->total_link_count++;
         if (current->total_link_count >= 40)
                 return -ELOOP;
@@ -859,7 +853,7 @@ static int follow_managed(struct path *path, unsigned flags)
                 mntput(path->mnt);
         if (ret == -EISDIR)
                 ret = 0;
-       return ret;
+       return ret < 0 ? ret : need_mntput;
  }
  
  int follow_down_one(struct path *path)
@@ -891,7 +885,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
                                struct inode **inode)
  {
         for (;;) {
-               struct vfsmount *mounted;
+               struct mount *mounted;
                 /*
                  * Don't forget we might have a non-mountpoint managed dentry
                  * that wants to block transit.
@@ -905,8 +899,9 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
                 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
                 if (!mounted)
                         break;
-               path->mnt = mounted;
-               path->dentry = mounted->mnt_root;
+               path->mnt = &mounted->mnt;
+               path->dentry = mounted->mnt.mnt_root;
+               nd->flags |= LOOKUP_JUMPED;
                 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
                 /*
                  * Update the inode too. We don't need to re-check the
@@ -921,12 +916,12 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
  static void follow_mount_rcu(struct nameidata *nd)
  {
         while (d_mountpoint(nd->path.dentry)) {
-               struct vfsmount *mounted;
+               struct mount *mounted;
                 mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
                 if (!mounted)
                         break;
-               nd->path.mnt = mounted;
-               nd->path.dentry = mounted->mnt_root;
+               nd->path.mnt = &mounted->mnt;
+               nd->path.dentry = mounted->mnt.mnt_root;
                 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
         }
  }
@@ -1059,51 +1054,65 @@ static void follow_dotdot(struct nameidata *nd)
  }
  
  /*
- * Allocate a dentry with name and parent, and perform a parent
- * directory ->lookup on it. Returns the new dentry, or ERR_PTR
- * on error. parent->d_inode->i_mutex must be held. d_lookup must
- * have verified that no child exists while under i_mutex.
+ * This looks up the name in dcache, possibly revalidates the old dentry and
+ * allocates a new one if not found or not valid.  In the need_lookup argument
+ * returns whether i_op->lookup is necessary.
+ *
+ * dir->d_inode->i_mutex must be held
   */
-static struct dentry *d_alloc_and_lookup(struct dentry *parent,
-                               struct qstr *name, struct nameidata *nd)
+static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
+                                   struct nameidata *nd, bool *need_lookup)
  {
-       struct inode *inode = parent->d_inode;
         struct dentry *dentry;
-       struct dentry *old;
+       int error;
  
-       /* Don't create child dentry for a dead directory. */
-       if (unlikely(IS_DEADDIR(inode)))
-               return ERR_PTR(-ENOENT);
+       *need_lookup = false;
+       dentry = d_lookup(dir, name);
+       if (dentry) {
+               if (d_need_lookup(dentry)) {
+                       *need_lookup = true;
+               } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
+                       error = d_revalidate(dentry, nd);
+                       if (unlikely(error <= 0)) {
+                               if (error < 0) {
+                                       dput(dentry);
+                                       return ERR_PTR(error);
+                               } else if (!d_invalidate(dentry)) {
+                                       dput(dentry);
+                                       dentry = NULL;
+                               }
+                       }
+               }
+       }
  
-       dentry = d_alloc(parent, name);
-       if (unlikely(!dentry))
-               return ERR_PTR(-ENOMEM);
+       if (!dentry) {
+               dentry = d_alloc(dir, name);
+               if (unlikely(!dentry))
+                       return ERR_PTR(-ENOMEM);
  
-       old = inode->i_op->lookup(inode, dentry, nd);
-       if (unlikely(old)) {
-               dput(dentry);
-               dentry = old;
+               *need_lookup = true;
         }
         return dentry;
  }
  
  /*
- * We already have a dentry, but require a lookup to be performed on the parent
- * directory to fill in d_inode. Returns the new dentry, or ERR_PTR on error.
- * parent->d_inode->i_mutex must be held. d_lookup must have verified that no
- * child exists while under i_mutex.
+ * Call i_op->lookup on the dentry.  The dentry must be negative but may be
+ * hashed if it was populated with DCACHE_NEED_LOOKUP.
+ *
+ * dir->d_inode->i_mutex must be held
   */
-static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentry,
-                                    struct nameidata *nd)
+static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
+                                 struct nameidata *nd)
  {
-       struct inode *inode = parent->d_inode;
         struct dentry *old;
  
         /* Don't create child dentry for a dead directory. */
-       if (unlikely(IS_DEADDIR(inode)))
+       if (unlikely(IS_DEADDIR(dir))) {
+               dput(dentry);
                 return ERR_PTR(-ENOENT);
+       }
  
-       old = inode->i_op->lookup(inode, dentry, nd);
+       old = dir->i_op->lookup(dir, dentry, nd);
         if (unlikely(old)) {
                 dput(dentry);
                 dentry = old;
@@ -1111,6 +1120,19 @@ static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentr
         return dentry;
  }
  
+static struct dentry *__lookup_hash(struct qstr *name,
+               struct dentry *base, struct nameidata *nd)
+{
+       bool need_lookup;
+       struct dentry *dentry;
+
+       dentry = lookup_dcache(name, base, nd, &need_lookup);
+       if (!need_lookup)
+               return dentry;
+
+       return lookup_real(base->d_inode, dentry, nd);
+}
+
  /*
   *  It's more convoluted than I'd like it to be, but... it's still fairly
   *  small and for now I'd prefer to have fast path as straight as possible.
@@ -1142,6 +1164,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
                         return -ECHILD;
                 nd->seq = seq;
  
+               if (unlikely(d_need_lookup(dentry)))
+                       goto unlazy;
                 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
                         status = d_revalidate(dentry, nd);
                         if (unlikely(status <= 0)) {
@@ -1150,8 +1174,6 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
                                 goto unlazy;
                         }
                 }
-               if (unlikely(d_need_lookup(dentry)))
-                       goto unlazy;
                 path->mnt = mnt;
                 path->dentry = dentry;
                 if (unlikely(!__follow_mount_rcu(nd, path, inode)))
@@ -1166,38 +1188,14 @@ unlazy:
                 dentry = __d_lookup(parent, name);
         }
  
-       if (dentry && unlikely(d_need_lookup(dentry))) {
+       if (unlikely(!dentry))
+               goto need_lookup;
+
+       if (unlikely(d_need_lookup(dentry))) {
                 dput(dentry);
-               dentry = NULL;
-       }
-retry:
-       if (unlikely(!dentry)) {
-               struct inode *dir = parent->d_inode;
-               BUG_ON(nd->inode != dir);
-
-               mutex_lock(&dir->i_mutex);
-               dentry = d_lookup(parent, name);
-               if (likely(!dentry)) {
-                       dentry = d_alloc_and_lookup(parent, name, nd);
-                       if (IS_ERR(dentry)) {
-                               mutex_unlock(&dir->i_mutex);
-                               return PTR_ERR(dentry);
-                       }
-                       /* known good */
-                       need_reval = 0;
-                       status = 1;
-               } else if (unlikely(d_need_lookup(dentry))) {
-                       dentry = d_inode_lookup(parent, dentry, nd);
-                       if (IS_ERR(dentry)) {
-                               mutex_unlock(&dir->i_mutex);
-                               return PTR_ERR(dentry);
-                       }
-                       /* known good */
-                       need_reval = 0;
-                       status = 1;
-               }
-               mutex_unlock(&dir->i_mutex);
+               goto need_lookup;
         }
+
         if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
                 status = d_revalidate(dentry, nd);
         if (unlikely(status <= 0)) {
@@ -1207,12 +1205,10 @@ retry:
                 }
                 if (!d_invalidate(dentry)) {
                         dput(dentry);
-                       dentry = NULL;
-                       need_reval = 1;
-                       goto retry;
+                       goto need_lookup;
                 }
         }
-
+done:
         path->mnt = mnt;
         path->dentry = dentry;
         err = follow_managed(path, nd->flags);
@@ -1220,8 +1216,20 @@ retry:
                 path_put_conditional(path, nd);
                 return err;
         }
+       if (err)
+               nd->flags |= LOOKUP_JUMPED;
         *inode = path->dentry->d_inode;
         return 0;
+
+need_lookup:
+       BUG_ON(nd->inode != parent->d_inode);
+
+       mutex_lock(&parent->d_inode->i_mutex);
+       dentry = __lookup_hash(name, parent, nd);
+       mutex_unlock(&parent->d_inode->i_mutex);
+       if (IS_ERR(dentry))
+               return PTR_ERR(dentry);
+       goto done;
  }
  
  static inline int may_lookup(struct nameidata *nd)
@@ -1376,6 +1384,128 @@ static inline int can_lookup(struct inode *inode)
  }
  
  /*
+ * We can do the critical dentry name comparison and hashing
+ * operations one word at a time, but we are limited to:
+ *
+ * - Architectures with fast unaligned word accesses. We could
+ *   do a "get_unaligned()" if this helps and is sufficiently
+ *   fast.
+ *
+ * - Little-endian machines (so that we can generate the mask
+ *   of low bytes efficiently). Again, we *could* do a byte
+ *   swapping load on big-endian architectures if that is not
+ *   expensive enough to make the optimization worthless.
+ *
+ * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
+ *   do not trap on the (extremely unlikely) case of a page
+ *   crossing operation).
+ *
+ * - Furthermore, we need an efficient 64-bit compile for the
+ *   64-bit case in order to generate the "number of bytes in
+ *   the final mask". Again, that could be replaced with an
+ *   efficient population count instruction or similar.
+ */
+#ifdef CONFIG_DCACHE_WORD_ACCESS
+
+#include <asm/word-at-a-time.h>
+
+#ifdef CONFIG_64BIT
+
+static inline unsigned int fold_hash(unsigned long hash)
+{
+       hash += hash >> (8*sizeof(int));
+       return hash;
+}
+
+#else  /* 32-bit case */
+
+#define fold_hash(x) (x)
+
+#endif
+
+unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+{
+       unsigned long a, mask;
+       unsigned long hash = 0;
+
+       for (;;) {
+               a = load_unaligned_zeropad(name);
+               if (len < sizeof(unsigned long))
+                       break;
+               hash += a;
+               hash *= 9;
+               name += sizeof(unsigned long);
+               len -= sizeof(unsigned long);
+               if (!len)
+                       goto done;
+       }
+       mask = ~(~0ul << len*8);
+       hash += mask & a;
+done:
+       return fold_hash(hash);
+}
+EXPORT_SYMBOL(full_name_hash);
+
+/*
+ * Calculate the length and hash of the path component, and
+ * return the length of the component;
+ */
+static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+{
+       unsigned long a, mask, hash, len;
+
+       hash = a = 0;
+       len = -sizeof(unsigned long);
+       do {
+               hash = (hash + a) * 9;
+               len += sizeof(unsigned long);
+               a = load_unaligned_zeropad(name+len);
+               /* Do we have any NUL or '/' bytes in this word? */
+               mask = has_zero(a) | has_zero(a ^ REPEAT_BYTE('/'));
+       } while (!mask);
+
+       /* The mask *below* the first high bit set */
+       mask = (mask - 1) & ~mask;
+       mask >>= 7;
+       hash += a & mask;
+       *hashp = fold_hash(hash);
+
+       return len + count_masked_bytes(mask);
+}
+
+#else
+
+unsigned int full_name_hash(const unsigned char *name, unsigned int len)
+{
+       unsigned long hash = init_name_hash();
+       while (len--)
+               hash = partial_name_hash(*name++, hash);
+       return end_name_hash(hash);
+}
+EXPORT_SYMBOL(full_name_hash);
+
+/*
+ * We know there's a real path component here of at least
+ * one character.
+ */
+static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+{
+       unsigned long hash = init_name_hash();
+       unsigned long len = 0, c;
+
+       c = (unsigned char)*name;
+       do {
+               len++;
+               hash = partial_name_hash(c, hash);
+               c = (unsigned char)name[len];
+       } while (c && c != '/');
+       *hashp = end_name_hash(hash);
+       return len;
+}
+
+#endif
+
+/*
   * Name resolution.
   * This is the basic name resolution function, turning a pathname into
   * the final dentry. We expect 'base' to be positive and a directory.
@@ -1395,31 +1525,22 @@ static int link_path_walk(const char *name, struct nameidata *nd)
  
         /* At this point we know we have a real path component. */
         for(;;) {
-               unsigned long hash;
                 struct qstr this;
-               unsigned int c;
+               long len;
                 int type;
  
                 err = may_lookup(nd);
                 if (err)
                         break;
  
+               len = hash_name(name, &this.hash);
                 this.name = name;
-               c = *(const unsigned char *)name;
-
-               hash = init_name_hash();
-               do {
-                       name++;
-                       hash = partial_name_hash(c, hash);
-                       c = *(const unsigned char *)name;
-               } while (c && (c != '/'));
-               this.len = name - (const char *) this.name;
-               this.hash = end_name_hash(hash);
+               this.len = len;
  
                 type = LAST_NORM;
-               if (this.name[0] == '.') switch (this.len) {
+               if (name[0] == '.') switch (len) {
                         case 2:
-                               if (this.name[1] == '.') {
+                               if (name[1] == '.') {
                                         type = LAST_DOTDOT;
                                         nd->flags |= LOOKUP_JUMPED;
                                 }
@@ -1438,12 +1559,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)
                         }
                 }
  
-               /* remove trailing slashes? */
-               if (!c)
+               if (!name[len])
                         goto last_component;
-               while (*++name == '/');
-               if (!*name)
+               /*
+                * If it wasn't NUL, we know it was '/'. Skip that
+                * slash, and continue until no more slashes.
+                */
+               do {
+                       len++;
+               } while (unlikely(name[len] == '/'));
+               if (!name[len])
                         goto last_component;
+               name += len;
  
                 err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
                 if (err < 0)
@@ -1699,59 +1826,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
         return err;
  }
  
-static struct dentry *__lookup_hash(struct qstr *name,
-               struct dentry *base, struct nameidata *nd)
-{
-       struct inode *inode = base->d_inode;
-       struct dentry *dentry;
-       int err;
-
-       err = inode_permission(inode, MAY_EXEC);
-       if (err)
-               return ERR_PTR(err);
-
-       /*
-        * Don't bother with __d_lookup: callers are for creat as
-        * well as unlink, so a lot of the time it would cost
-        * a double lookup.
-        */
-       dentry = d_lookup(base, name);
-
-       if (dentry && d_need_lookup(dentry)) {
-               /*
-                * __lookup_hash is called with the parent dir's i_mutex already
-                * held, so we are good to go here.
-                */
-               dentry = d_inode_lookup(base, dentry, nd);
-               if (IS_ERR(dentry))
-                       return dentry;
-       }
-
-       if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-               int status = d_revalidate(dentry, nd);
-               if (unlikely(status <= 0)) {
-                       /*
-                        * The dentry failed validation.
-                        * If d_revalidate returned 0 attempt to invalidate
-                        * the dentry otherwise d_revalidate is asking us
-                        * to return a fail status.
-                        */
-                       if (status < 0) {
-                               dput(dentry);
-                               return ERR_PTR(status);
-                       } else if (!d_invalidate(dentry)) {
-                               dput(dentry);
-                               dentry = NULL;
-                       }
-               }
-       }
-
-       if (!dentry)
-               dentry = d_alloc_and_lookup(base, name, nd);
-
-       return dentry;
-}
-
  /*
   * Restricted form of lookup. Doesn't follow links, single-component only,
   * needs parent already locked. Doesn't follow mounts.
@@ -1776,24 +1850,22 @@ static struct dentry *lookup_hash(struct nameidata *nd)
  struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
  {
         struct qstr this;
-       unsigned long hash;
         unsigned int c;
+       int err;
  
         WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
  
         this.name = name;
         this.len = len;
+       this.hash = full_name_hash(name, len);
         if (!len)
                 return ERR_PTR(-EACCES);
  
-       hash = init_name_hash();
         while (len--) {
                 c = *(const unsigned char *)name++;
                 if (c == '/' || c == '\0')
                         return ERR_PTR(-EACCES);
-               hash = partial_name_hash(c, hash);
         }
-       this.hash = end_name_hash(hash);
         /*
          * See if the low-level filesystem might want
          * to use its own hash..
@@ -1804,14 +1876,18 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
                         return ERR_PTR(err);
         }
  
+       err = inode_permission(base->d_inode, MAY_EXEC);
+       if (err)
+               return ERR_PTR(err);
+
         return __lookup_hash(&this, base, NULL);
  }
  
-int user_path_at(int dfd, const char __user *name, unsigned flags,
-                struct path *path)
+int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
+                struct path *path, int *empty)
  {
         struct nameidata nd;
-       char *tmp = getname_flags(name, flags);
+       char *tmp = getname_flags(name, flags, empty);
         int err = PTR_ERR(tmp);
         if (!IS_ERR(tmp)) {
  
@@ -1825,6 +1901,12 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
         return err;
  }
  
+int user_path_at(int dfd, const char __user *name, unsigned flags,
+                struct path *path)
+{
+       return user_path_at_empty(dfd, name, flags, path, NULL);
+}
+
  static int user_path_parent(int dfd, const char __user *path,
                         struct nameidata *nd, char **name)
  {
@@ -1865,6 +1947,26 @@ other_userns:
  }
  
  /*
+ * Do the directory specific tests of inode_permission() and call the
+ * may_delete inode operation.  The may_delete inode operation must do the
+ * sticky check when needed.
+ */
+static int may_delete_iop(struct inode *dir, struct inode *inode, int replace)
+{
+       int error;
+
+       if (IS_RDONLY(dir))
+               return -EROFS;
+       if (IS_IMMUTABLE(dir))
+               return -EACCES;
+       error = dir->i_op->may_delete(dir, inode, replace);
+       if (!error)
+               error = security_inode_permission(dir, MAY_WRITE | MAY_EXEC);
+
+       return error;
+}
+
+/*
   *     Check whether we can remove a link victim from directory dir, check
   *  whether the type of victim is right.
   *  1. We can't do it if dir is read-only (done in permission())
@@ -1883,7 +1985,8 @@ other_userns:
   * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
   *     nfs_async_unlink().
   */
-static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
+static int may_delete(struct inode *dir, struct dentry *victim,
+                     int isdir, int replace)
  {
         int error;
  
@@ -1892,14 +1995,19 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
  
         BUG_ON(victim->d_parent->d_inode != dir);
         audit_inode_child(victim, dir);
-
-       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+       if (dir->i_op->may_delete)
+               error = may_delete_iop(dir, victim->d_inode, replace);
+       else {
+               error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+               if (!error && check_sticky(dir, victim->d_inode))
+                       error = -EPERM;
+       }
         if (error)
                 return error;
         if (IS_APPEND(dir))
                 return -EPERM;
-       if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
-           IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+       if (IS_APPEND(victim->d_inode) || IS_IMMUTABLE(victim->d_inode) ||
+               IS_SWAPFILE(victim->d_inode))
                 return -EPERM;
         if (isdir) {
                 if (!S_ISDIR(victim->d_inode->i_mode))
@@ -1915,6 +2023,25 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
         return 0;
  }
  
+/*
+ * Do the directory specific tests of inode_permission() and call the
+ * may_create inode operation.
+ */
+static int may_create_iop(struct inode *dir, int isdir)
+{
+       int error;
+
+       if (IS_RDONLY(dir))
+               return -EROFS;
+       if (IS_IMMUTABLE(dir))
+               return -EACCES;
+       error = dir->i_op->may_create(dir, isdir);
+       if (!error)
+               error = security_inode_permission(dir, MAY_WRITE | MAY_EXEC);
+
+       return error;
+}
+
  /*     Check whether we can create an object with dentry child in directory
   *  dir.
   *  1. We can't do it if child already exists (open has special treatment for
@@ -1923,13 +2050,16 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
   *  3. We should have write and exec permissions on dir
   *  4. We can't do it if dir is immutable (done in permission())
   */
-static inline int may_create(struct inode *dir, struct dentry *child)
+static inline int may_create(struct inode *dir, struct dentry *child, int isdir)
  {
         if (child->d_inode)
                 return -EEXIST;
         if (IS_DEADDIR(dir))
                 return -ENOENT;
-       return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+       if (dir->i_op->may_create)
+               return may_create_iop(dir, isdir);
+       else
+               return inode_permission(dir, MAY_WRITE | MAY_EXEC);
  }
  
  /*
@@ -1974,10 +2104,10 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
         }
  }
  
-int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
+int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
                 struct nameidata *nd)
  {
-       int error = may_create(dir, dentry);
+       int error = may_create(dir, dentry, 0);
  
         if (error)
                 return error;
@@ -2044,10 +2174,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
         if (flag & O_NOATIME && !inode_owner_or_capable(inode))
                 return -EPERM;
  
-       /*
-        * Ensure there are no outstanding leases on the file.
-        */
-       return break_lease(inode, flag);
+       return 0;
  }
  
  static int handle_truncate(struct file *filp)
@@ -2138,7 +2265,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                 /* sayonara */
                 error = complete_walk(nd);
                 if (error)
-                       return ERR_PTR(-ECHILD);
+                       return ERR_PTR(error);
  
                 error = -ENOTDIR;
                 if (nd->flags & LOOKUP_DIRECTORY) {
@@ -2150,6 +2277,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
         }
  
         /* create side of things */
+       /*
+        * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED has been
+        * cleared when we got to the last component we are about to look up
+        */
         error = complete_walk(nd);
         if (error)
                 return ERR_PTR(error);
@@ -2174,8 +2305,8 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
  
         /* Negative dentry, just create the file */
         if (!dentry->d_inode) {
-               int mode = op->mode;
-               if (!IS_POSIXACL(dir->d_inode))
+               umode_t mode = op->mode;
+               if (!IS_ACL(dir->d_inode))
                         mode &= ~current_umask();
                 /*
                  * This write is needed to ensure that a
@@ -2218,6 +2349,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
         if (error < 0)
                 goto exit_dput;
  
+       if (error)
+               nd->flags |= LOOKUP_JUMPED;
+
         error = -ENOENT;
         if (!path->dentry->d_inode)
                 goto exit_dput;
@@ -2227,6 +2361,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
  
         path_to_nameidata(path, nd);
         nd->inode = path->dentry->d_inode;
+       /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
+       error = complete_walk(nd);
+       if (error)
+               return ERR_PTR(error);
         error = -EISDIR;
         if (S_ISDIR(nd->inode->i_mode))
                 goto exit;
@@ -2434,9 +2572,9 @@ struct dentry *user_path_create(int dfd, const char __user *pathname, struct pat
  }
  EXPORT_SYMBOL(user_path_create);
  
-int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
  {
-       int error = may_create(dir, dentry);
+       int error = may_create(dir, dentry, 0);
  
         if (error)
                 return error;
@@ -2462,7 +2600,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
         return error;
  }
  
-static int may_mknod(mode_t mode)
+static int may_mknod(umode_t mode)
  {
         switch (mode & S_IFMT) {
         case S_IFREG:
@@ -2479,7 +2617,7 @@ static int may_mknod(mode_t mode)
         }
  }
  
-SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
+SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
                 unsigned, dev)
  {
         struct dentry *dentry;
@@ -2493,7 +2631,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
         if (IS_ERR(dentry))
                 return PTR_ERR(dentry);
  
-       if (!IS_POSIXACL(path.dentry->d_inode))
+       if (!IS_ACL(path.dentry->d_inode))
                 mode &= ~current_umask();
         error = may_mknod(mode);
         if (error)
@@ -2526,14 +2664,15 @@ out_dput:
         return error;
  }
  
-SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev)
+SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
  {
         return sys_mknodat(AT_FDCWD, filename, mode, dev);
  }
  
-int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
  {
-       int error = may_create(dir, dentry);
+       int error = may_create(dir, dentry, 1);
+       unsigned max_links = dir->i_sb->s_max_links;
  
         if (error)
                 return error;
@@ -2546,13 +2685,16 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
         if (error)
                 return error;
  
+       if (max_links && dir->i_nlink >= max_links)
+               return -EMLINK;
+
         error = dir->i_op->mkdir(dir, dentry, mode);
         if (!error)
                 fsnotify_mkdir(dir, dentry);
         return error;
  }
  
-SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
+SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
  {
         struct dentry *dentry;
         struct path path;
@@ -2562,7 +2704,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
         if (IS_ERR(dentry))
                 return PTR_ERR(dentry);
  
-       if (!IS_POSIXACL(path.dentry->d_inode))
+       if (!IS_ACL(path.dentry->d_inode))
                 mode &= ~current_umask();
         error = mnt_want_write(path.mnt);
         if (error)
@@ -2580,14 +2722,14 @@ out_dput:
         return error;
  }
  
-SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
+SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
  {
         return sys_mkdirat(AT_FDCWD, pathname, mode);
  }
  
  /*
   * The dentry_unhash() helper will try to drop the dentry early: we
- * should have a usage count of 2 if we're the only user of this
+ * should have a usage count of 1 if we're the only user of this
   * dentry, and if that is true (possibly after pruning the dcache),
   * then we drop the dentry now.
   *
@@ -2611,7 +2753,7 @@ void dentry_unhash(struct dentry *dentry)
  
  int vfs_rmdir(struct inode *dir, struct dentry *dentry)
  {
-       int error = may_delete(dir, dentry, 1);
+       int error = may_delete(dir, dentry, 1, 0);
  
         if (error)
                 return error;
@@ -2619,6 +2761,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
         if (!dir->i_op->rmdir)
                 return -EPERM;
  
+       dget(dentry);
         mutex_lock(&dentry->d_inode->i_mutex);
  
         error = -EBUSY;
@@ -2639,6 +2782,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
  
  out:
         mutex_unlock(&dentry->d_inode->i_mutex);
+       dput(dentry);
         if (!error)
                 d_delete(dentry);
         return error;
@@ -2704,7 +2848,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
  
  int vfs_unlink(struct inode *dir, struct dentry *dentry)
  {
-       int error = may_delete(dir, dentry, 0);
+       int error = may_delete(dir, dentry, 0, 0);
  
         if (error)
                 return error;
@@ -2813,7 +2957,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
  
  int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
  {
-       int error = may_create(dir, dentry);
+       int error = may_create(dir, dentry, 0);
  
         if (error)
                 return error;
@@ -2874,12 +3018,13 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
  int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
  {
         struct inode *inode = old_dentry->d_inode;
+       unsigned max_links = dir->i_sb->s_max_links;
         int error;
  
         if (!inode)
                 return -ENOENT;
  
-       error = may_create(dir, new_dentry);
+       error = may_create(dir, new_dentry, S_ISDIR(inode->i_mode));
         if (error)
                 return error;
  
@@ -2904,6 +3049,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
         /* Make sure we don't allow creating hardlink to an unlinked file */
         if (inode->i_nlink == 0)
                 error =  -ENOENT;
+       else if (max_links && inode->i_nlink >= max_links)
+               error = -EMLINK;
         else
                 error = dir->i_op->link(old_dentry, dir, new_dentry);
         mutex_unlock(&inode->i_mutex);
@@ -3013,6 +3160,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
  {
         int error = 0;
         struct inode *target = new_dentry->d_inode;
+       unsigned max_links = new_dir->i_sb->s_max_links;
  
         /*
          * If we are going to change the parent - check write permissions,
@@ -3028,6 +3176,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
         if (error)
                 return error;
  
+       dget(new_dentry);
         if (target)
                 mutex_lock(&target->i_mutex);
  
@@ -3035,6 +3184,11 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
         if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
                 goto out;
  
+       error = -EMLINK;
+       if (max_links && !target && new_dir != old_dir &&
+           new_dir->i_nlink >= max_links)
+               goto out;
+
         if (target)
                 shrink_dcache_parent(new_dentry);
         error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -3048,6 +3202,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
  out:
         if (target)
                 mutex_unlock(&target->i_mutex);
+       dput(new_dentry);
         if (!error)
                 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
                         d_move(old_dentry,new_dentry);
@@ -3097,14 +3252,14 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         if (old_dentry->d_inode == new_dentry->d_inode)
                 return 0;
   
-       error = may_delete(old_dir, old_dentry, is_dir);
+       error = may_delete(old_dir, old_dentry, is_dir, 0);
         if (error)
                 return error;
  
         if (!new_dentry->d_inode)
-               error = may_create(new_dir, new_dentry);
+               error = may_create(new_dir, new_dentry, is_dir);
         else
-               error = may_delete(new_dir, new_dentry, is_dir);
+               error = may_delete(new_dir, new_dentry, is_dir, 1);
         if (error)
                 return error;
  
@@ -3332,9 +3487,9 @@ retry:
         if (err)
                 goto fail;
  
-       kaddr = kmap_atomic(page, KM_USER0);
+       kaddr = kmap_atomic(page);
         memcpy(kaddr, symname, len-1);
-       kunmap_atomic(kaddr, KM_USER0);
+       kunmap_atomic(kaddr);
  
         err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
                                                         page, fsdata);