Update to 3.4-final.

[linux-flexiantxendom0-3.2.10.git] / mm / slab.c
diff --git a/mm/slab.c b/mm/slab.c

index d96e223..e901a36 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -121,6 +121,8 @@
  #include       <asm/tlbflush.h>
  #include       <asm/page.h>
  
+#include <trace/events/kmem.h>
+
  /*
   * DEBUG       - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
   *               0 for faster, smaller code (especially in the critical paths).
@@ -479,11 +481,13 @@ EXPORT_SYMBOL(slab_buffer_size);
  #endif
  
  /*
- * Do not go above this order unless 0 objects fit into the slab.
+ * Do not go above this order unless 0 objects fit into the slab or
+ * overridden on the command line.
   */
-#define        BREAK_GFP_ORDER_HI      1
-#define        BREAK_GFP_ORDER_LO      0
-static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+#define        SLAB_MAX_ORDER_HI       1
+#define        SLAB_MAX_ORDER_LO       0
+static int slab_max_order = SLAB_MAX_ORDER_LO;
+static bool slab_max_order_set __initdata;
  
  /*
   * Functions for storing/retrieving the cachep and or slab from the page
@@ -574,7 +578,9 @@ static struct arraycache_init initarray_generic =
      { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
  
  /* internal cache of cache description objs */
+static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
  static struct kmem_cache cache_cache = {
+       .nodelists = cache_cache_nodelists,
         .batchcount = 1,
         .limit = BOOT_CPUCACHE_ENTRIES,
         .shared = 1,
@@ -593,6 +599,7 @@ static enum {
         PARTIAL_AC,
         PARTIAL_L3,
         EARLY,
+       LATE,
         FULL
  } g_cpucache_up;
  
@@ -620,37 +627,67 @@ int slab_is_available(void)
  static struct lock_class_key on_slab_l3_key;
  static struct lock_class_key on_slab_alc_key;
  
+static struct lock_class_key debugobj_l3_key;
+static struct lock_class_key debugobj_alc_key;
+
+static void slab_set_lock_classes(struct kmem_cache *cachep,
+               struct lock_class_key *l3_key, struct lock_class_key *alc_key,
+               int q)  /* q indexes cachep->nodelists[] */
+{
+       struct array_cache **alc;
+       struct kmem_list3 *l3;
+       int r;
+
+       l3 = cachep->nodelists[q];
+       if (!l3)        /* no list for this node: nothing to annotate */
+               return;
+
+       lockdep_set_class(&l3->list_lock, l3_key);
+       alc = l3->alien;
+       /*
+        * FIXME: This check for BAD_ALIEN_MAGIC should go away when
+        * common slab code is taught to work even without alien
+        * caches.  Currently, non-NUMA code returns BAD_ALIEN_MAGIC
+        * from alloc_alien_cache(), so that value is skipped here
+        * exactly like a NULL alien array.
+        */
+       if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
+               return;
+       for_each_node(r) {
+               if (alc[r])     /* only populated alien slots have a lock */
+                       lockdep_set_class(&alc[r]->lock, alc_key);
+       }
+}
+
+static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
+{      /* apply the debugobj_* lock class keys to one node of cachep */
+       slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
+}
+
+static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
+{
+       int node;
+
+       for_each_online_node(node)      /* same annotation on every online node */
+               slab_set_debugobj_lock_classes_node(cachep, node);
+}
+
  static void init_node_lock_keys(int q)
  {
         struct cache_sizes *s = malloc_sizes;
  
-       if (g_cpucache_up != FULL)
+       if (g_cpucache_up < LATE)       /* LATE and FULL both proceed */
                 return;
  
         for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
-               struct array_cache **alc;
                 struct kmem_list3 *l3;
-               int r;
  
                 l3 = s->cs_cachep->nodelists[q];
                 if (!l3 || OFF_SLAB(s->cs_cachep))
                         continue;
-               lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
-               alc = l3->alien;
-               /*
-                * FIXME: This check for BAD_ALIEN_MAGIC
-                * should go away when common slab code is taught to
-                * work even without alien caches.
-                * Currently, non NUMA code returns BAD_ALIEN_MAGIC
-                * for alloc_alien_cache,
-                */
-               if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
-                       continue;
-               for_each_node(r) {
-                       if (alc[r])
-                               lockdep_set_class(&alc[r]->lock,
-                                       &on_slab_alc_key);
-               }
+
+               slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key,
+                               &on_slab_alc_key, q);
         }
  }
  
@@ -669,6 +706,14 @@ static void init_node_lock_keys(int q)
  static inline void init_lock_keys(void)
  {
  }
+
+static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
+{      /* empty stub; the guarding #if is outside this view -- confirm */
+}
+
+static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
+{      /* empty stub; the guarding #if is outside this view -- confirm */
+}
  #endif
  
  /*
@@ -811,6 +856,17 @@ static int __init noaliencache_setup(char *s)
  }
  __setup("noaliencache", noaliencache_setup);
  
+static int __init slab_max_order_setup(char *str)
+{
+       get_option(&str, &slab_max_order);      /* return value not checked */
+       slab_max_order = slab_max_order < 0 ? 0 :
+                               min(slab_max_order, MAX_ORDER - 1); /* clamp */
+       slab_max_order_set = true;      /* kmem_cache_init() then keeps it */
+
+       return 1;
+}
+__setup("slab_max_order=", slab_max_order_setup);
+
  #ifdef CONFIG_NUMA
  /*
   * Special reaping functions for NUMA systems called from cache_reap().
@@ -1262,6 +1318,8 @@ static int __cpuinit cpuup_prepare(long cpu)
                 spin_unlock_irq(&l3->list_lock);
                 kfree(shared);
                 free_alien_cache(alien);
+               if (cachep->flags & SLAB_DEBUG_OBJECTS)
+                       slab_set_debugobj_lock_classes_node(cachep, node);
         }
         init_node_lock_keys(node);
  
@@ -1457,10 +1515,11 @@ void __init kmem_cache_init(void)
  
         /*
          * Fragmentation resistance on low memory - only use bigger
-        * page orders on machines with more than 32MB of memory.
+        * page orders on machines with more than 32MB of memory if
+        * not overridden on the command line.
          */
-       if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
-               slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+       if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
+               slab_max_order = SLAB_MAX_ORDER_HI;
  
         /* Bootstrap is tricky, because several objects are allocated
          * from caches that do not exist yet:
@@ -1492,11 +1551,10 @@ void __init kmem_cache_init(void)
         cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
  
         /*
-        * struct kmem_cache size depends on nr_node_ids, which
-        * can be less than MAX_NUMNODES.
+        * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
          */
-       cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) +
-                                nr_node_ids * sizeof(struct kmem_list3 *);
+       cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+                                 nr_node_ids * sizeof(struct kmem_list3 *);
  #if DEBUG
         cache_cache.obj_size = cache_cache.buffer_size;
  #endif
@@ -1625,6 +1683,11 @@ void __init kmem_cache_init_late(void)
  {
         struct kmem_cache *cachep;
  
+       g_cpucache_up = LATE;
+
+       /* Annotate slab for lockdep -- annotate the malloc caches */
+       init_lock_keys();
+
         /* 6) resize the head arrays to their final sizes */
         mutex_lock(&cache_chain_mutex);
         list_for_each_entry(cachep, &cache_chain, next)
@@ -1635,9 +1698,6 @@ void __init kmem_cache_init_late(void)
         /* Done! */
         g_cpucache_up = FULL;
  
-       /* Annotate slab for lockdep -- annotate the malloc caches */
-       init_lock_keys();
-
         /*
          * Register a cpu startup notifier callback that initializes
          * cpu_cache_get for all new cpus
@@ -1671,6 +1731,52 @@ static int __init cpucache_init(void)
  }
  __initcall(cpucache_init);
  
+static noinline void
+slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
+{
+       struct kmem_list3 *l3;
+       struct slab *slabp;
+       unsigned long flags;
+       int node;
+
+       printk(KERN_WARNING
+               "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
+               nodeid, gfpflags);
+       printk(KERN_WARNING "  cache: %s, object size: %d, order: %d\n",
+               cachep->name, cachep->buffer_size, cachep->gfporder);
+       /* One summary line per online node that has a list for this cache. */
+       for_each_online_node(node) {
+               unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
+               unsigned long active_slabs = 0, num_slabs = 0;
+
+               l3 = cachep->nodelists[node];
+               if (!l3)
+                       continue;
+               /* Tally slabs and objects under the node's list lock. */
+               spin_lock_irqsave(&l3->list_lock, flags);
+               list_for_each_entry(slabp, &l3->slabs_full, list) {
+                       active_objs += cachep->num;
+                       active_slabs++;
+               }
+               list_for_each_entry(slabp, &l3->slabs_partial, list) {
+                       active_objs += slabp->inuse;
+                       active_slabs++;
+               }
+               list_for_each_entry(slabp, &l3->slabs_free, list)
+                       num_slabs++;
+
+               free_objects += l3->free_objects;
+               spin_unlock_irqrestore(&l3->list_lock, flags);
+               /* Free slabs were counted alone above; add the active ones. */
+               num_slabs += active_slabs;
+               num_objs = num_slabs * cachep->num;
+               printk(KERN_WARNING
+                       "  node %d: slabs: %lu/%lu, objs: %lu/%lu, free: %lu\n",
+                       node, active_slabs, num_slabs, active_objs, num_objs,
+                       free_objects);
+       }
+}
+
  /*
   * Interface to system's page allocator. No need to hold the cache-lock.
   *
@@ -1697,8 +1803,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
                 flags |= __GFP_RECLAIMABLE;
  
         page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
-       if (!page)
+       if (!page) {
+               if (!(flags & __GFP_NOWARN) && printk_ratelimit())
+                       slab_out_of_memory(cachep, flags, nodeid);
                 return NULL;
+       }
  
         nr_pages = (1 << cachep->gfporder);
         if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
@@ -1810,15 +1919,15 @@ static void dump_line(char *data, int offset, int limit)
         unsigned char error = 0;
         int bad_count = 0;
  
-       printk(KERN_ERR "%03x:", offset);
+       printk(KERN_ERR "%03x: ", offset);
         for (i = 0; i < limit; i++) {
                 if (data[offset + i] != POISON_FREE) {
                         error = data[offset + i];
                         bad_count++;
                 }
-               printk(" %02x", (unsigned char)data[offset + i]);
         }
-       printk("\n");
+       print_hex_dump(KERN_CONT, "", 0, 16, 1,
+                       &data[offset], limit, 1);
  
         if (bad_count == 1) {
                 error ^= POISON_FREE;
@@ -1886,8 +1995,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
                         /* Print header */
                         if (lines == 0) {
                                 printk(KERN_ERR
-                                       "Slab corruption: %s start=%p, len=%d\n",
-                                       cachep->name, realobj, size);
+                                       "Slab corruption (%s): %s start=%p, len=%d\n",
+                                       print_tainted(), cachep->name, realobj, size);
                                 print_objinfo(cachep, objp, 0);
                         }
                         /* Hexdump the affected line */
@@ -2071,7 +2180,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
                  * Large number of objects is good, but very large slabs are
                  * currently bad for the gfp()s.
                  */
-               if (gfporder >= slab_break_gfp_order)
+               if (gfporder >= slab_max_order)
                         break;
  
                 /*
@@ -2308,6 +2417,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
         if (!cachep)
                 goto oops;
  
+       cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
  #if DEBUG
         cachep->obj_size = size;
  
@@ -2424,6 +2534,16 @@ kmem_cache_create (const char *name, size_t size, size_t align,
                 goto oops;
         }
  
+       if (flags & SLAB_DEBUG_OBJECTS) {
+               /*
+                * Would deadlock through slab_destroy()->call_rcu()->
+                * debug_object_activate()->kmem_cache_alloc().
+                */
+               WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
+
+               slab_set_debugobj_lock_classes(cachep);
+       }
+
         /* cache setup completed, link it into the list */
         list_add(&cachep->next, &cache_chain);
  oops:
@@ -2985,16 +3105,12 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
         if (entries != cachep->num - slabp->inuse) {
  bad:
                 printk(KERN_ERR "slab: Internal list corruption detected in "
-                               "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
-                       cachep->name, cachep->num, slabp, slabp->inuse);
-               for (i = 0;
-                    i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
-                    i++) {
-                       if (i % 16 == 0)
-                               printk("\n%03x:", i);
-                       printk(" %02x", ((unsigned char *)slabp)[i]);
-               }
-               printk("\n");
+                       "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
+                       cachep->name, cachep->num, slabp, slabp->inuse,
+                       print_tainted());
+               print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
+                       sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
+                       1);
                 BUG();
         }
  }
@@ -3153,12 +3269,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
         objp += obj_offset(cachep);
         if (cachep->ctor && cachep->flags & SLAB_POISON)
                 cachep->ctor(objp);
-#if ARCH_SLAB_MINALIGN
-       if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+       if (ARCH_SLAB_MINALIGN &&
+           ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
                 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
-                      objp, ARCH_SLAB_MINALIGN);
+                      objp, (int)ARCH_SLAB_MINALIGN);
         }
-#endif
         return objp;
  }
  #else
@@ -3218,12 +3333,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
         if (in_interrupt() || (flags & __GFP_THISNODE))
                 return NULL;
         nid_alloc = nid_here = numa_mem_id();
-       get_mems_allowed();
         if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
                 nid_alloc = cpuset_slab_spread_node();
         else if (current->mempolicy)
                 nid_alloc = slab_node(current->mempolicy);
-       put_mems_allowed();
         if (nid_alloc != nid_here)
                 return ____cache_alloc_node(cachep, flags, nid_alloc);
         return NULL;
@@ -3246,14 +3359,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
         enum zone_type high_zoneidx = gfp_zone(flags);
         void *obj = NULL;
         int nid;
+       unsigned int cpuset_mems_cookie;
  
         if (flags & __GFP_THISNODE)
                 return NULL;
  
-       get_mems_allowed();
-       zonelist = node_zonelist(slab_node(current->mempolicy), flags);
         local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
  
+retry_cpuset:
+       cpuset_mems_cookie = get_mems_allowed();
+       zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+
  retry:
         /*
          * Look through allowed nodes for objects available
@@ -3306,7 +3422,9 @@ retry:
                         }
                 }
         }
-       put_mems_allowed();
+
+       if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
+               goto retry_cpuset;
         return obj;
  }
  
@@ -3402,7 +3520,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
         cache_alloc_debugcheck_before(cachep, flags);
         local_irq_save(save_flags);
  
-       if (nodeid == -1)
+       if (nodeid == NUMA_NO_NODE)
                 nodeid = slab_node;
  
         if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3627,13 +3745,12 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
  
         if (likely(ac->avail < ac->limit)) {
                 STATS_INC_FREEHIT(cachep);
-               ac->entry[ac->avail++] = objp;
-               return;
         } else {
                 STATS_INC_FREEMISS(cachep);
                 cache_flusharray(cachep, ac);
-               ac->entry[ac->avail++] = objp;
         }
+
+       ac->entry[ac->avail++] = objp;
  }
  
  /**
@@ -3933,7 +4050,7 @@ fail:
  
  struct ccupdate_struct {
         struct kmem_cache *cachep;
-       struct array_cache *new[NR_CPUS];
+       struct array_cache *new[0];
  };
  
  static void do_ccupdate_local(void *info)
@@ -3955,7 +4072,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
         struct ccupdate_struct *new;
         int i;
  
-       new = kzalloc(sizeof(*new), gfp);
+       new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
+                     gfp);
         if (!new)
                 return -ENOMEM;
  
@@ -4532,7 +4650,7 @@ static const struct file_operations proc_slabstats_operations = {
  
  static int __init slab_proc_init(void)
  {
-       proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
+       proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
  #ifdef CONFIG_DEBUG_SLAB_LEAK
         proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
  #endif