Update to 3.4-final.

diff --git a/mm/percpu.c b/mm/percpu.c
index 9e16d1c..bb4be74 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -67,6 +67,7 @@
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
+#include <linux/kmemleak.h>
 
 #include <asm/cacheflush.h>
 #include <asm/sections.h>
@@ -116,9 +117,9 @@ static int pcpu_atom_size __read_mostly;
 static int pcpu_nr_slots __read_mostly;
 static size_t pcpu_chunk_struct_size __read_mostly;
 
-/* cpus with the lowest and highest unit numbers */
-static unsigned int pcpu_first_unit_cpu __read_mostly;
-static unsigned int pcpu_last_unit_cpu __read_mostly;
+/* cpus with the lowest and highest unit addresses */
+static unsigned int pcpu_low_unit_cpu __read_mostly;
+static unsigned int pcpu_high_unit_cpu __read_mostly;
 
 /* the address of the first chunk which starts with the kernel static area */
 void *pcpu_base_addr __read_mostly;
@@ -258,7 +259,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
 
 /*
  * (Un)populated page region iterators.  Iterate over (un)populated
- * page regions betwen @start and @end in @chunk.  @rs and @re should
+ * page regions between @start and @end in @chunk.  @rs and @re should
  * be integer variables and will be set to start and end page index of
  * the current region.
  */
@@ -273,11 +274,11 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
             (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
 
 /**
- * pcpu_mem_alloc - allocate memory
+ * pcpu_mem_zalloc - allocate memory
  * @size: bytes to allocate
  *
  * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vmalloc() is used.  The returned
+ * kzalloc() is used; otherwise, vzalloc() is used.  The returned
  * memory is always zeroed.
  *
  * CONTEXT:
@@ -286,7 +287,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
-static void *pcpu_mem_alloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size)
 {
        if (WARN_ON_ONCE(!slab_is_available()))
                return NULL;
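
The hunk above ends before the allocation branch itself; per the updated
comment, the renamed helper is a simple size-based dispatch.  A minimal
sketch of that behaviour (the _sketch name is illustrative and not taken
from the kernel tree; kzalloc(), vzalloc() and slab_is_available() are the
real interfaces):

#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Sketch only: zeroed allocation, slab for <= PAGE_SIZE, vmalloc above. */
static void *pcpu_mem_zalloc_sketch(size_t size)
{
        if (WARN_ON_ONCE(!slab_is_available()))
                return NULL;            /* too early in boot for slab */

        if (size <= PAGE_SIZE)
                return kzalloc(size, GFP_KERNEL);
        else
                return vzalloc(size);
}
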
@@ -302,7 +303,7 @@ static void *pcpu_mem_alloc(size_t size)
  * @ptr: memory to free
  * @size: size of the area
  *
- * Free @ptr.  @ptr should have been allocated using pcpu_mem_alloc().
+ * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
  */
 static void pcpu_mem_free(void *ptr, size_t size)
 {
@@ -342,7 +343,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
  * @chunk: chunk of interest
  *
  * Determine whether area map of @chunk needs to be extended to
- * accomodate a new allocation.
+ * accommodate a new allocation.
  *
  * CONTEXT:
  * pcpu_lock.
@@ -384,7 +385,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
        size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
        unsigned long flags;
 
-       new = pcpu_mem_alloc(new_size);
+       new = pcpu_mem_zalloc(new_size);
        if (!new)
                return -ENOMEM;
 
@@ -431,7 +432,7 @@ out_unlock:
  * depending on @head, is reduced by @tail bytes and @tail byte block
  * is inserted after the target block.
  *
- * @chunk->map must have enough free slots to accomodate the split.
+ * @chunk->map must have enough free slots to accommodate the split.
  *
  * CONTEXT:
  * pcpu_lock.
@@ -604,11 +605,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
 {
        struct pcpu_chunk *chunk;
 
-       chunk = pcpu_mem_alloc(pcpu_chunk_struct_size);
+       chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
        if (!chunk)
                return NULL;
 
-       chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+       chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
+                                               sizeof(chunk->map[0]));
        if (!chunk->map) {
                kfree(chunk);
                return NULL;
@@ -709,6 +711,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
        const char *err;
        int slot, off, new_alloc;
        unsigned long flags;
+       void __percpu *ptr;
 
        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
                WARN(true, "illegal size (%zu) or align (%zu) for "
@@ -801,7 +804,9 @@ area_found:
        mutex_unlock(&pcpu_alloc_mutex);
 
        /* return address relative to base address */
-       return __addr_to_pcpu_ptr(chunk->base_addr + off);
+       ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
+       kmemleak_alloc_percpu(ptr, size);
+       return ptr;
 
 fail_unlock:
        spin_unlock_irqrestore(&pcpu_lock, flags);
@@ -915,6 +920,8 @@ void free_percpu(void __percpu *ptr)
        if (!ptr)
                return;
 
+       kmemleak_free_percpu(ptr);
+
        addr = __pcpu_ptr_to_addr(ptr);
 
        spin_lock_irqsave(&pcpu_lock, flags);
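
With the two hunks above, every dynamic percpu allocation is registered
with kmemleak and unregistered again on free.  A hypothetical driver
snippet (foo_stats, foo_init and foo_exit are made up; alloc_percpu() and
free_percpu() are the real interfaces) showing what that buys:

#include <linux/percpu.h>
#include <linux/errno.h>

struct foo_stats {
        u64 packets;
        u64 bytes;
};

static struct foo_stats __percpu *foo_stats;

static int foo_init(void)
{
        foo_stats = alloc_percpu(struct foo_stats); /* now tracked by kmemleak */
        if (!foo_stats)
                return -ENOMEM;
        return 0;
}

static void foo_exit(void)
{
        /* Dropping this call would now show up as a kmemleak report. */
        free_percpu(foo_stats);
}
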
@@ -977,6 +984,17 @@ bool is_kernel_percpu_address(unsigned long addr)
  * address.  The caller is responsible for ensuring @addr stays valid
  * until this function finishes.
  *
+ * The percpu allocator has a special setup for the first chunk, which
+ * currently supports either embedding in the linear address space or a
+ * vmalloc mapping; from the second chunk onwards, the backing allocator
+ * (currently either vm or km) provides the translation.
+ *
+ * The addr could be translated simply, without checking whether it falls
+ * into the first chunk.  But the current code better reflects how the
+ * percpu allocator actually works, and the verification can discover
+ * bugs both in the percpu allocator itself and in per_cpu_ptr_to_phys()
+ * callers.  So we keep the current code.
+ *
  * RETURNS:
  * The physical address for @addr.
  */
@@ -984,19 +1002,19 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
 {
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        bool in_first_chunk = false;
-       unsigned long first_start, first_end;
+       unsigned long first_low, first_high;
        unsigned int cpu;
 
        /*
-        * The following test on first_start/end isn't strictly
+        * The following test on unit_low/high isn't strictly
         * necessary but will speed up lookups of addresses which
         * aren't in the first chunk.
         */
-       first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0);
-       first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu,
-                                   pcpu_unit_pages);
-       if ((unsigned long)addr >= first_start &&
-           (unsigned long)addr < first_end) {
+       first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
+       first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
+                                    pcpu_unit_pages);
+       if ((unsigned long)addr >= first_low &&
+           (unsigned long)addr < first_high) {
                for_each_possible_cpu(cpu) {
                        void *start = per_cpu_ptr(base, cpu);
 
@@ -1008,13 +1026,14 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
        }
 
        if (in_first_chunk) {
-               if ((unsigned long)addr < VMALLOC_START ||
-                   (unsigned long)addr >= VMALLOC_END)
+               if (!is_vmalloc_addr(addr))
                        return __pa(addr);
                else
-                       return page_to_phys(vmalloc_to_page(addr));
+                       return page_to_phys(vmalloc_to_page(addr)) +
+                              offset_in_page(addr);
        } else
-               return page_to_phys(pcpu_addr_to_page(addr));
+               return page_to_phys(pcpu_addr_to_page(addr)) +
+                      offset_in_page(addr);
 }
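
Both branches above gained "+ offset_in_page(addr)": page_to_phys() (and
vmalloc_to_page()) only resolve the containing page, so without the offset
the returned physical address was rounded down to the page boundary.  A
minimal illustration of the vmalloc case (the wrapper name is hypothetical;
the expression is the one used in the hunk):

/* Physical address of a vmalloc'ed percpu address: page base + offset. */
static phys_addr_t vmalloc_addr_to_phys_sketch(void *addr)
{
        return page_to_phys(vmalloc_to_page(addr)) + offset_in_page(addr);
}
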
 
 /**
@@ -1113,20 +1132,20 @@ static void pcpu_dump_alloc_info(const char *lvl,
                for (alloc_end += gi->nr_units / upa;
                     alloc < alloc_end; alloc++) {
                        if (!(alloc % apl)) {
-                               printk("\n");
+                               printk(KERN_CONT "\n");
                                printk("%spcpu-alloc: ", lvl);
                        }
-                       printk("[%0*d] ", group_width, group);
+                       printk(KERN_CONT "[%0*d] ", group_width, group);
 
                        for (unit_end += upa; unit < unit_end; unit++)
                                if (gi->cpu_map[unit] != NR_CPUS)
-                                       printk("%0*d ", cpu_width,
+                                       printk(KERN_CONT "%0*d ", cpu_width,
                                               gi->cpu_map[unit]);
                                else
-                                       printk("%s ", empty_str);
+                                       printk(KERN_CONT "%s ", empty_str);
                }
        }
-       printk("\n");
+       printk(KERN_CONT "\n");
 }
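
The hunk above only touches message continuation: every printk() fragment
after the first needs KERN_CONT, otherwise it may be logged as a separate
message at the default loglevel.  A small hypothetical helper (the function
is made up; printk(), KERN_INFO and KERN_CONT are the real interfaces):

#include <linux/printk.h>

/* Build one logical "pcpu-alloc: [00] [01] ..." line from fragments. */
static void print_unit_list(int nr_units)
{
        int i;

        printk(KERN_INFO "pcpu-alloc:");
        for (i = 0; i < nr_units; i++)
                printk(KERN_CONT " [%02d]", i);
        printk(KERN_CONT "\n");
}
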
 
 /**
@@ -1216,8 +1235,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
        PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
 #ifdef CONFIG_SMP
        PCPU_SETUP_BUG_ON(!ai->static_size);
+       PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
 #endif
        PCPU_SETUP_BUG_ON(!base_addr);
+       PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
        PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
        PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
        PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
@@ -1232,7 +1253,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 
        for (cpu = 0; cpu < nr_cpu_ids; cpu++)
                unit_map[cpu] = UINT_MAX;
-       pcpu_first_unit_cpu = NR_CPUS;
+
+       pcpu_low_unit_cpu = NR_CPUS;
+       pcpu_high_unit_cpu = NR_CPUS;
 
        for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
                const struct pcpu_group_info *gi = &ai->groups[group];
@@ -1252,9 +1275,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
                        unit_map[cpu] = unit + i;
                        unit_off[cpu] = gi->base_offset + i * ai->unit_size;
 
-                       if (pcpu_first_unit_cpu == NR_CPUS)
-                               pcpu_first_unit_cpu = cpu;
-                       pcpu_last_unit_cpu = cpu;
+                       /* determine low/high unit_cpu */
+                       if (pcpu_low_unit_cpu == NR_CPUS ||
+                           unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
+                               pcpu_low_unit_cpu = cpu;
+                       if (pcpu_high_unit_cpu == NR_CPUS ||
+                           unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
+                               pcpu_high_unit_cpu = cpu;
                }
        }
        pcpu_nr_units = unit;
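
The scan above compares unit_off[] instead of simply remembering the first
and last CPU visited, because unit offsets need not increase with CPU
number.  A made-up example (the offsets are illustrative only):

/*
 *   cpu:        0         1         2
 *   unit_off:   0x200000  0x000000  0x100000
 *
 * Iteration order would pick cpu0 as "first" and cpu2 as "last", but the
 * lowest unit address belongs to cpu1 and the highest to cpu0, so the
 * loop sets pcpu_low_unit_cpu = 1 and pcpu_high_unit_cpu = 0.
 */
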
@@ -1264,7 +1291,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 
        /* we're done parsing the input, undefine BUG macro and dump config */
 #undef PCPU_SETUP_BUG_ON
-       pcpu_dump_alloc_info(KERN_INFO, ai);
+       pcpu_dump_alloc_info(KERN_DEBUG, ai);
 
        pcpu_nr_groups = ai->nr_groups;
        pcpu_group_offsets = group_offsets;
@@ -1436,7 +1463,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
        /*
         * Determine min_unit_size, alloc_size and max_upa such that
         * alloc_size is multiple of atom_size and is the smallest
-        * which can accomodate 4k aligned segments which are equal to
+        * which can accommodate 4k aligned segments which are equal to
         * or larger than min_unit_size.
         */
        min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
@@ -1551,7 +1578,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
  * @atom_size: allocation atom size
  * @cpu_distance_fn: callback to determine distance between cpus, optional
  * @alloc_fn: function to allocate percpu page
- * @free_fn: funtion to free percpu page
+ * @free_fn: function to free percpu page
  *
  * This is a helper to ease setting up embedded first percpu chunk and
  * can be called where pcpu_setup_first_chunk() is expected.
@@ -1618,9 +1645,21 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
                        rc = -ENOMEM;
                        goto out_free_areas;
                }
+               /* kmemleak tracks the percpu allocations separately */
+               kmemleak_free(ptr);
                areas[group] = ptr;
 
                base = min(ptr, base);
+       }
+
+       /*
+        * Copy data and free unused parts.  This should happen after all
+        * allocations are complete; otherwise, we may end up with
+        * overlapping groups.
+        */
+       for (group = 0; group < ai->nr_groups; group++) {
+               struct pcpu_group_info *gi = &ai->groups[group];
+               void *ptr = areas[group];
 
                for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
                        if (gi->cpu_map[i] == NR_CPUS) {
@@ -1646,8 +1685,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
        /* warn if maximum distance is further than 75% of vmalloc space */
        if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
                pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
-                          "space 0x%lx\n",
-                          max_distance, VMALLOC_END - VMALLOC_START);
+                          "space 0x%lx\n", max_distance,
+                          (unsigned long)(VMALLOC_END - VMALLOC_START));
 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
                /* and fail if we have fallback */
                rc = -EINVAL;
@@ -1679,7 +1718,7 @@ out_free:
  * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
  * @reserved_size: the size of reserved percpu area in bytes
  * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
- * @free_fn: funtion to free percpu page, always called with PAGE_SIZE
+ * @free_fn: function to free percpu page, always called with PAGE_SIZE
  * @populate_pte_fn: function to populate pte
  *
  * This is a helper to ease setting up page-remapped first percpu
@@ -1732,6 +1771,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
                                           "for cpu%u\n", psize_str, cpu);
                                goto enomem;
                        }
+                       /* kmemleak tracks the percpu allocations separately */
+                       kmemleak_free(ptr);
                        pages[j++] = virt_to_page(ptr);
                }
 
@@ -1854,6 +1895,8 @@ void __init setup_per_cpu_areas(void)
        fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
        if (!ai || !fc)
                panic("Failed to allocate memory for percpu areas.");
+       /* kmemleak tracks the percpu allocations separately */
+       kmemleak_free(fc);
 
        ai->dyn_size = unit_size;
        ai->unit_size = unit_size;
@@ -1888,7 +1931,7 @@ void __init percpu_init_late(void)
 
                BUILD_BUG_ON(size > PAGE_SIZE);
 
-               map = pcpu_mem_alloc(size);
+               map = pcpu_mem_zalloc(size);
                BUG_ON(!map);
 
                spin_lock_irqsave(&pcpu_lock, flags);