vmstat: Optimize zone counter modifications through the use of this_cpu operations
1 /*
2  *  linux/mm/vmstat.c
3  *
4  *  Manages VM statistics
5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
6  *
7  *  zoned VM statistics
8  *  Copyright (C) 2006 Silicon Graphics, Inc.,
9  *              Christoph Lameter <christoph@lameter.com>
10  */
11 #include <linux/fs.h>
12 #include <linux/mm.h>
13 #include <linux/err.h>
14 #include <linux/module.h>
15 #include <linux/slab.h>
16 #include <linux/cpu.h>
17 #include <linux/vmstat.h>
18 #include <linux/sched.h>
19 #include <linux/math64.h>
20 #include <linux/writeback.h>
21 #include <linux/compaction.h>
22
23 #ifdef CONFIG_VM_EVENT_COUNTERS
24 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
25 EXPORT_PER_CPU_SYMBOL(vm_event_states);
26
27 static void sum_vm_events(unsigned long *ret)
28 {
29         int cpu;
30         int i;
31
32         memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
33
34         for_each_online_cpu(cpu) {
35                 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
36
37                 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
38                         ret[i] += this->event[i];
39         }
40 }
41
42 /*
43  * Accumulate the vm event counters across all CPUs.
44  * The result is unavoidably approximate - it can change
45  * during and after execution of this function.
46  */
47 void all_vm_events(unsigned long *ret)
48 {
49         get_online_cpus();
50         sum_vm_events(ret);
51         put_online_cpus();
52 }
53 EXPORT_SYMBOL_GPL(all_vm_events);
54
55 #ifdef CONFIG_HOTPLUG
56 /*
57  * Fold the foreign cpu events into our own.
58  *
59  * This is adding to the events on one processor
60  * but keeps the global counts constant.
61  */
62 void vm_events_fold_cpu(int cpu)
63 {
64         struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
65         int i;
66
67         for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
68                 count_vm_events(i, fold_state->event[i]);
69                 fold_state->event[i] = 0;
70         }
71 }
72 #endif /* CONFIG_HOTPLUG */
73
74 #endif /* CONFIG_VM_EVENT_COUNTERS */
75
76 /*
77  * Manage combined zone based / global counters
78  *
79  * vm_stat contains the global counters
80  */
81 atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
82 EXPORT_SYMBOL(vm_stat);
83
84 #ifdef CONFIG_SMP
85
86 static int calculate_threshold(struct zone *zone)
87 {
88         int threshold;
89         int mem;        /* memory in 128 MB units */
90
91         /*
92          * The threshold scales with the number of processors and the amount
93          * of memory per zone. More memory means that we can defer updates for
94          * longer; more processors could lead to more contention.
95          * fls() is used to have a cheap way of logarithmic scaling.
96          *
97          * Some sample thresholds:
98          *
99          * Threshold    Processors      (fls)   Zonesize        fls(mem+1)
100          * ------------------------------------------------------------------
101          * 8            1               1       0.9-1 GB        4
102          * 16           2               2       0.9-1 GB        4
103          * 20           2               2       1-2 GB          5
104          * 24           2               2       2-4 GB          6
105          * 28           2               2       4-8 GB          7
106          * 32           2               2       8-16 GB         8
107          * 4            2               2       <128M           1
108          * 30           4               3       2-4 GB          5
109          * 48           4               3       8-16 GB         8
110          * 32           8               4       1-2 GB          4
111          * 32           8               4       0.9-1GB         4
112          * 10           16              5       <128M           1
113          * 40           16              5       900M            4
114          * 70           64              7       2-4 GB          5
115          * 84           64              7       4-8 GB          6
116          * 108          512             9       4-8 GB          6
117          * 125          1024            10      8-16 GB         8
118          * 125          1024            10      16-32 GB        9
119          */
120
121         mem = zone->present_pages >> (27 - PAGE_SHIFT);
122
123         threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
124
125         /*
126          * Maximum threshold is 125
127          */
128         threshold = min(125, threshold);
129
130         return threshold;
131 }
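
/*
 * Worked example, assuming one online CPU and a zone of about 0.9 GB:
 * the zone spans roughly 7 units of 128 MB, so mem = 7 and fls(mem) = 3,
 * while fls(num_online_cpus()) = 1, giving
 * threshold = 2 * 1 * (1 + 3) = 8, matching the first row of the table.
 */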
132
133 /*
134  * Refresh the thresholds for each zone.
135  */
136 static void refresh_zone_stat_thresholds(void)
137 {
138         struct zone *zone;
139         int cpu;
140         int threshold;
141
142         for_each_populated_zone(zone) {
143                 unsigned long max_drift, tolerate_drift;
144
145                 threshold = calculate_threshold(zone);
146
147                 for_each_online_cpu(cpu)
148                         per_cpu_ptr(zone->pageset, cpu)->stat_threshold
149                                                         = threshold;
150
151                 /*
152                  * Only set percpu_drift_mark if there is a danger that
153                  * NR_FREE_PAGES reports the low watermark is ok when in fact
154                  * the min watermark could be breached by an allocation
155                  */
156                 tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
157                 max_drift = num_online_cpus() * threshold;
158                 if (max_drift > tolerate_drift)
159                         zone->percpu_drift_mark = high_wmark_pages(zone) +
160                                         max_drift;
161         }
162 }
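
/*
 * Worked example: with a stat_threshold of 32 and 8 online CPUs the per-cpu
 * differentials can together hide up to max_drift = 8 * 32 = 256 pages from
 * NR_FREE_PAGES.  If low_wmark_pages - min_wmark_pages is smaller than 256,
 * percpu_drift_mark is set to high_wmark_pages + 256.
 */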
163
164 /*
165  * For use when we know that interrupts are disabled.
166  */
167 void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
168                                 int delta)
169 {
170         struct per_cpu_pageset __percpu *pcp = zone->pageset;
171         s8 __percpu *p = pcp->vm_stat_diff + item;
172         long x;
173         long t;
174
175         x = delta + __this_cpu_read(*p);
176
177         t = __this_cpu_read(pcp->stat_threshold);
178
179         if (unlikely(x > t || x < -t)) {
180                 zone_page_state_add(x, zone, item);
181                 x = 0;
182         }
183         __this_cpu_write(*p, x);
184 }
185 EXPORT_SYMBOL(__mod_zone_page_state);
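
/*
 * A minimal usage sketch (hypothetical caller): with interrupts already
 * disabled, an allocator path handing out 1 << order pages could account
 * for them with
 *
 *	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
 *
 * A caller that cannot guarantee the interrupt state would use
 * mod_zone_page_state() below instead.
 */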
186
187 /*
188  * For an unknown interrupt state
189  */
190 void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
191                                         int delta)
192 {
193         unsigned long flags;
194
195         local_irq_save(flags);
196         __mod_zone_page_state(zone, item, delta);
197         local_irq_restore(flags);
198 }
199 EXPORT_SYMBOL(mod_zone_page_state);
200
201 /*
202  * Optimized increment and decrement functions.
203  *
204  * These are only for a single page and therefore can take a struct page *
205  * argument instead of struct zone *. This allows the inclusion of the code
206  * generated for page_zone(page) into the optimized functions.
207  *
208  * No overflow check is necessary and therefore the differential can be
209  * incremented or decremented in place which may allow the compilers to
210  * generate better code.
211  * The increment or decrement is known and therefore one boundary check can
212  * be omitted.
213  *
214  * NOTE: These functions are very performance sensitive. Change only
215  * with care.
216  *
217  * Some processors have inc/dec instructions that are atomic vs an interrupt.
218  * However, the code must first determine the differential location in a zone
219  * based on the processor number and then inc/dec the counter. There is no
220  * guarantee without disabling preemption that the processor will not change
221  * in between and therefore the atomicity vs. interrupt cannot be exploited
222  * in a useful way here.
223  */
224 void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
225 {
226         struct per_cpu_pageset __percpu *pcp = zone->pageset;
227         s8 __percpu *p = pcp->vm_stat_diff + item;
228         s8 v, t;
229
230         __this_cpu_inc(*p);
231
232         v = __this_cpu_read(*p);
233         t = __this_cpu_read(pcp->stat_threshold);
234         if (unlikely(v > t)) {
235                 s8 overstep = t >> 1;
236
237                 zone_page_state_add(v + overstep, zone, item);
238                 __this_cpu_write(*p, -overstep);
239         }
240 }
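
/*
 * Worked example of the overstep: with stat_threshold t = 32 the per-cpu
 * diff may climb to v = 33 before folding.  At that point
 * v + overstep = 33 + 16 = 49 pages are pushed into the zone and global
 * counters and the diff is reset to -16, leaving an extra half-threshold of
 * headroom before the next fold.
 */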
241
242 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
243 {
244         __inc_zone_state(page_zone(page), item);
245 }
246 EXPORT_SYMBOL(__inc_zone_page_state);
247
248 void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
249 {
250         struct per_cpu_pageset __percpu *pcp = zone->pageset;
251         s8 __percpu *p = pcp->vm_stat_diff + item;
252         s8 v, t;
253
254         __this_cpu_dec(*p);
255
256         v = __this_cpu_read(*p);
257         t = __this_cpu_read(pcp->stat_threshold);
258         if (unlikely(v < -t)) {
259                 s8 overstep = t >> 1;
260
261                 zone_page_state_add(v - overstep, zone, item);
262                 __this_cpu_write(*p, overstep);
263         }
264 }
265
266 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
267 {
268         __dec_zone_state(page_zone(page), item);
269 }
270 EXPORT_SYMBOL(__dec_zone_page_state);
271
272 void inc_zone_state(struct zone *zone, enum zone_stat_item item)
273 {
274         unsigned long flags;
275
276         local_irq_save(flags);
277         __inc_zone_state(zone, item);
278         local_irq_restore(flags);
279 }
280
281 void inc_zone_page_state(struct page *page, enum zone_stat_item item)
282 {
283         unsigned long flags;
284         struct zone *zone;
285
286         zone = page_zone(page);
287         local_irq_save(flags);
288         __inc_zone_state(zone, item);
289         local_irq_restore(flags);
290 }
291 EXPORT_SYMBOL(inc_zone_page_state);
292
293 void dec_zone_page_state(struct page *page, enum zone_stat_item item)
294 {
295         unsigned long flags;
296
297         local_irq_save(flags);
298         __dec_zone_page_state(page, item);
299         local_irq_restore(flags);
300 }
301 EXPORT_SYMBOL(dec_zone_page_state);
302
303 /*
304  * Update the zone counters for one cpu.
305  *
306  * The cpu specified must be either the current cpu or a processor that
307  * is not online. If it is the current cpu then the execution thread must
308  * be pinned to the current cpu.
309  *
310  * Note that refresh_cpu_vm_stats strives to only access
311  * node local memory. The per cpu pagesets on remote zones are placed
312  * in the memory local to the processor using that pageset. So the
313  * loop over all zones will access a series of cachelines local to
314  * the processor.
315  *
316  * The call to zone_page_state_add updates the cachelines with the
317  * statistics in the remote zone struct as well as the global cachelines
318  * with the global counters. These could cause remote node cache line
319  * bouncing and will have to be only done when necessary.
320  */
321 void refresh_cpu_vm_stats(int cpu)
322 {
323         struct zone *zone;
324         int i;
325         int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
326
327         for_each_populated_zone(zone) {
328                 struct per_cpu_pageset *p;
329
330                 p = per_cpu_ptr(zone->pageset, cpu);
331
332                 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
333                         if (p->vm_stat_diff[i]) {
334                                 unsigned long flags;
335                                 int v;
336
337                                 local_irq_save(flags);
338                                 v = p->vm_stat_diff[i];
339                                 p->vm_stat_diff[i] = 0;
340                                 local_irq_restore(flags);
341                                 atomic_long_add(v, &zone->vm_stat[i]);
342                                 global_diff[i] += v;
343 #ifdef CONFIG_NUMA
344                                 /* 3 seconds idle till flush */
345                                 p->expire = 3;
346 #endif
347                         }
348                 cond_resched();
349 #ifdef CONFIG_NUMA
350                 /*
351                  * Deal with draining the remote pageset of this
352                  * processor
353                  *
354          * Check if there are pages remaining in this pageset;
355          * if not, then there is nothing to expire.
356                  */
357                 if (!p->expire || !p->pcp.count)
358                         continue;
359
360                 /*
361                  * We never drain zones local to this processor.
362                  */
363                 if (zone_to_nid(zone) == numa_node_id()) {
364                         p->expire = 0;
365                         continue;
366                 }
367
368                 p->expire--;
369                 if (p->expire)
370                         continue;
371
372                 if (p->pcp.count)
373                         drain_zone_pages(zone, &p->pcp);
374 #endif
375         }
376
377         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
378                 if (global_diff[i])
379                         atomic_long_add(global_diff[i], &vm_stat[i]);
380 }
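
/*
 * For example, if this cpu's pagesets carry vm_stat_diff[NR_FILE_PAGES] = 5
 * for two different zones, 5 is folded into each zone's vm_stat and the
 * final loop adds the accumulated global_diff of 10 to the global
 * vm_stat[NR_FILE_PAGES] with one atomic operation per item.
 */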
381
382 #endif
383
384 #ifdef CONFIG_NUMA
385 /*
386  * preferred_zone = the preferred zone of the zonelist passed to the allocator
387  * z              = the zone from which the allocation occurred.
388  *
389  * Must be called with interrupts disabled.
390  */
391 void zone_statistics(struct zone *preferred_zone, struct zone *z)
392 {
393         if (z->zone_pgdat == preferred_zone->zone_pgdat) {
394                 __inc_zone_state(z, NUMA_HIT);
395         } else {
396                 __inc_zone_state(z, NUMA_MISS);
397                 __inc_zone_state(preferred_zone, NUMA_FOREIGN);
398         }
399         if (z->node == numa_node_id())
400                 __inc_zone_state(z, NUMA_LOCAL);
401         else
402                 __inc_zone_state(z, NUMA_OTHER);
403 }
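
/*
 * Example: a task running on node 0 whose allocation falls back to a zone
 * on node 1 increments NUMA_MISS on the node 1 zone and NUMA_FOREIGN on the
 * preferred node 0 zone; since that zone is not on the local node, it also
 * increments NUMA_OTHER on the node 1 zone.
 */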
404 #endif
405
406 #ifdef CONFIG_COMPACTION
407
408 struct contig_page_info {
409         unsigned long free_pages;
410         unsigned long free_blocks_total;
411         unsigned long free_blocks_suitable;
412 };
413
414 /*
415  * Calculate the number of free pages in a zone, how many contiguous
416  * pages are free and how many are large enough to satisfy an allocation of
417  * the target size. Note that this function makes no attempt to estimate
418  * how many suitable free blocks there *might* be if MOVABLE pages were
419  * migrated. Calculating that is possible, but expensive and can be
420  * figured out from userspace
421  */
422 static void fill_contig_page_info(struct zone *zone,
423                                 unsigned int suitable_order,
424                                 struct contig_page_info *info)
425 {
426         unsigned int order;
427
428         info->free_pages = 0;
429         info->free_blocks_total = 0;
430         info->free_blocks_suitable = 0;
431
432         for (order = 0; order < MAX_ORDER; order++) {
433                 unsigned long blocks;
434
435                 /* Count number of free blocks */
436                 blocks = zone->free_area[order].nr_free;
437                 info->free_blocks_total += blocks;
438
439                 /* Count free base pages */
440                 info->free_pages += blocks << order;
441
442                 /* Count the suitable free blocks */
443                 if (order >= suitable_order)
444                         info->free_blocks_suitable += blocks <<
445                                                 (order - suitable_order);
446         }
447 }
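
/*
 * For instance, assuming a zone whose only free memory is 10 order-0 pages
 * and 2 order-3 blocks, this reports free_blocks_total = 12,
 * free_pages = 10 + (2 << 3) = 26 and, for suitable_order = 2,
 * free_blocks_suitable = 2 << (3 - 2) = 4.
 */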
448
449 /*
450  * A fragmentation index only makes sense if an allocation of a requested
451  * size would fail. If that is true, the fragmentation index indicates
452  * whether external fragmentation or a lack of memory was the problem.
453  * The value can be used to determine if page reclaim or compaction
454  * should be used
455  */
456 static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
457 {
458         unsigned long requested = 1UL << order;
459
460         if (!info->free_blocks_total)
461                 return 0;
462
463         /* Fragmentation index only makes sense when a request would fail */
464         if (info->free_blocks_suitable)
465                 return -1000;
466
467         /*
468          * Index is between 0 and 1 so return within 3 decimal places
469          *
470          * 0 => allocation would fail due to lack of memory
471          * 1 => allocation would fail due to fragmentation
472          */
473         return 1000 - div_u64((1000 + div_u64(info->free_pages * 1000ULL, requested)), info->free_blocks_total);
474 }
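
/*
 * Worked example, assuming order = 4 (requested = 16 pages), 20 free blocks
 * totalling 80 free pages and no block of order >= 4:
 * index = 1000 - (1000 + 80 * 1000 / 16) / 20 = 1000 - 6000 / 20 = 700,
 * i.e. 0.700, pointing at external fragmentation rather than a plain
 * shortage of memory.
 */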
475
476 /* Same as __fragmentation_index() but allocates contig_page_info on stack */
477 int fragmentation_index(struct zone *zone, unsigned int order)
478 {
479         struct contig_page_info info;
480
481         fill_contig_page_info(zone, order, &info);
482         return __fragmentation_index(order, &info);
483 }
484 #endif
485
486 #if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
487 #include <linux/proc_fs.h>
488 #include <linux/seq_file.h>
489
490 static char * const migratetype_names[MIGRATE_TYPES] = {
491         "Unmovable",
492         "Reclaimable",
493         "Movable",
494         "Reserve",
495         "Isolate",
496 };
497
498 static void *frag_start(struct seq_file *m, loff_t *pos)
499 {
500         pg_data_t *pgdat;
501         loff_t node = *pos;
502         for (pgdat = first_online_pgdat();
503              pgdat && node;
504              pgdat = next_online_pgdat(pgdat))
505                 --node;
506
507         return pgdat;
508 }
509
510 static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
511 {
512         pg_data_t *pgdat = (pg_data_t *)arg;
513
514         (*pos)++;
515         return next_online_pgdat(pgdat);
516 }
517
518 static void frag_stop(struct seq_file *m, void *arg)
519 {
520 }
521
522 /* Walk all the zones in a node and print using a callback */
523 static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
524                 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
525 {
526         struct zone *zone;
527         struct zone *node_zones = pgdat->node_zones;
528         unsigned long flags;
529
530         for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
531                 if (!populated_zone(zone))
532                         continue;
533
534                 spin_lock_irqsave(&zone->lock, flags);
535                 print(m, pgdat, zone);
536                 spin_unlock_irqrestore(&zone->lock, flags);
537         }
538 }
539 #endif
540
541 #ifdef CONFIG_PROC_FS
542 static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
543                                                 struct zone *zone)
544 {
545         int order;
546
547         seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
548         for (order = 0; order < MAX_ORDER; ++order)
549                 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
550         seq_putc(m, '\n');
551 }
552
553 /*
554  * This walks the free areas for each zone.
555  */
556 static int frag_show(struct seq_file *m, void *arg)
557 {
558         pg_data_t *pgdat = (pg_data_t *)arg;
559         walk_zones_in_node(m, pgdat, frag_show_print);
560         return 0;
561 }
562
563 static void pagetypeinfo_showfree_print(struct seq_file *m,
564                                         pg_data_t *pgdat, struct zone *zone)
565 {
566         int order, mtype;
567
568         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
569                 seq_printf(m, "Node %4d, zone %8s, type %12s ",
570                                         pgdat->node_id,
571                                         zone->name,
572                                         migratetype_names[mtype]);
573                 for (order = 0; order < MAX_ORDER; ++order) {
574                         unsigned long freecount = 0;
575                         struct free_area *area;
576                         struct list_head *curr;
577
578                         area = &(zone->free_area[order]);
579
580                         list_for_each(curr, &area->free_list[mtype])
581                                 freecount++;
582                         seq_printf(m, "%6lu ", freecount);
583                 }
584                 seq_putc(m, '\n');
585         }
586 }
587
588 /* Print out the free pages at each order for each migratetype */
589 static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
590 {
591         int order;
592         pg_data_t *pgdat = (pg_data_t *)arg;
593
594         /* Print header */
595         seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
596         for (order = 0; order < MAX_ORDER; ++order)
597                 seq_printf(m, "%6d ", order);
598         seq_putc(m, '\n');
599
600         walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
601
602         return 0;
603 }
604
605 static void pagetypeinfo_showblockcount_print(struct seq_file *m,
606                                         pg_data_t *pgdat, struct zone *zone)
607 {
608         int mtype;
609         unsigned long pfn;
610         unsigned long start_pfn = zone->zone_start_pfn;
611         unsigned long end_pfn = start_pfn + zone->spanned_pages;
612         unsigned long count[MIGRATE_TYPES] = { 0, };
613
614         for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
615                 struct page *page;
616
617                 if (!pfn_valid(pfn))
618                         continue;
619
620                 page = pfn_to_page(pfn);
621
622                 /* Watch for unexpected holes punched in the memmap */
623                 if (!memmap_valid_within(pfn, page, zone))
624                         continue;
625
626                 mtype = get_pageblock_migratetype(page);
627
628                 if (mtype < MIGRATE_TYPES)
629                         count[mtype]++;
630         }
631
632         /* Print counts */
633         seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
634         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
635                 seq_printf(m, "%12lu ", count[mtype]);
636         seq_putc(m, '\n');
637 }
638
639 /* Print out the number of pageblocks for each migratetype */
640 static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
641 {
642         int mtype;
643         pg_data_t *pgdat = (pg_data_t *)arg;
644
645         seq_printf(m, "\n%-23s", "Number of blocks type ");
646         for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
647                 seq_printf(m, "%12s ", migratetype_names[mtype]);
648         seq_putc(m, '\n');
649         walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
650
651         return 0;
652 }
653
654 /*
655  * This prints out statistics in relation to grouping pages by mobility.
656  * It is expensive to collect so do not constantly read the file.
657  */
658 static int pagetypeinfo_show(struct seq_file *m, void *arg)
659 {
660         pg_data_t *pgdat = (pg_data_t *)arg;
661
662         /* check memoryless node */
663         if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
664                 return 0;
665
666         seq_printf(m, "Page block order: %d\n", pageblock_order);
667         seq_printf(m, "Pages per block:  %lu\n", pageblock_nr_pages);
668         seq_putc(m, '\n');
669         pagetypeinfo_showfree(m, pgdat);
670         pagetypeinfo_showblockcount(m, pgdat);
671
672         return 0;
673 }
674
675 static const struct seq_operations fragmentation_op = {
676         .start  = frag_start,
677         .next   = frag_next,
678         .stop   = frag_stop,
679         .show   = frag_show,
680 };
681
682 static int fragmentation_open(struct inode *inode, struct file *file)
683 {
684         return seq_open(file, &fragmentation_op);
685 }
686
687 static const struct file_operations fragmentation_file_operations = {
688         .open           = fragmentation_open,
689         .read           = seq_read,
690         .llseek         = seq_lseek,
691         .release        = seq_release,
692 };
693
694 static const struct seq_operations pagetypeinfo_op = {
695         .start  = frag_start,
696         .next   = frag_next,
697         .stop   = frag_stop,
698         .show   = pagetypeinfo_show,
699 };
700
701 static int pagetypeinfo_open(struct inode *inode, struct file *file)
702 {
703         return seq_open(file, &pagetypeinfo_op);
704 }
705
706 static const struct file_operations pagetypeinfo_file_ops = {
707         .open           = pagetypeinfo_open,
708         .read           = seq_read,
709         .llseek         = seq_lseek,
710         .release        = seq_release,
711 };
712
713 #ifdef CONFIG_ZONE_DMA
714 #define TEXT_FOR_DMA(xx) xx "_dma",
715 #else
716 #define TEXT_FOR_DMA(xx)
717 #endif
718
719 #ifdef CONFIG_ZONE_DMA32
720 #define TEXT_FOR_DMA32(xx) xx "_dma32",
721 #else
722 #define TEXT_FOR_DMA32(xx)
723 #endif
724
725 #ifdef CONFIG_HIGHMEM
726 #define TEXT_FOR_HIGHMEM(xx) xx "_high",
727 #else
728 #define TEXT_FOR_HIGHMEM(xx)
729 #endif
730
731 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
732                                         TEXT_FOR_HIGHMEM(xx) xx "_movable",
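
/*
 * With CONFIG_ZONE_DMA, CONFIG_ZONE_DMA32 and CONFIG_HIGHMEM all enabled,
 * TEXTS_FOR_ZONES("pgalloc") expands to
 *	"pgalloc_dma", "pgalloc_dma32", "pgalloc_normal",
 *	"pgalloc_high", "pgalloc_movable",
 */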
733
734 static const char * const vmstat_text[] = {
735         /* Zoned VM counters */
736         "nr_free_pages",
737         "nr_inactive_anon",
738         "nr_active_anon",
739         "nr_inactive_file",
740         "nr_active_file",
741         "nr_unevictable",
742         "nr_mlock",
743         "nr_anon_pages",
744         "nr_mapped",
745         "nr_file_pages",
746         "nr_dirty",
747         "nr_writeback",
748         "nr_slab_reclaimable",
749         "nr_slab_unreclaimable",
750         "nr_page_table_pages",
751         "nr_kernel_stack",
752         "nr_unstable",
753         "nr_bounce",
754         "nr_vmscan_write",
755         "nr_writeback_temp",
756         "nr_isolated_anon",
757         "nr_isolated_file",
758         "nr_shmem",
759         "nr_dirtied",
760         "nr_written",
761
762 #ifdef CONFIG_NUMA
763         "numa_hit",
764         "numa_miss",
765         "numa_foreign",
766         "numa_interleave",
767         "numa_local",
768         "numa_other",
769 #endif
770         "nr_dirty_threshold",
771         "nr_dirty_background_threshold",
772
773 #ifdef CONFIG_VM_EVENT_COUNTERS
774         "pgpgin",
775         "pgpgout",
776         "pswpin",
777         "pswpout",
778
779         TEXTS_FOR_ZONES("pgalloc")
780
781         "pgfree",
782         "pgactivate",
783         "pgdeactivate",
784
785         "pgfault",
786         "pgmajfault",
787
788         TEXTS_FOR_ZONES("pgrefill")
789         TEXTS_FOR_ZONES("pgsteal")
790         TEXTS_FOR_ZONES("pgscan_kswapd")
791         TEXTS_FOR_ZONES("pgscan_direct")
792
793 #ifdef CONFIG_NUMA
794         "zone_reclaim_failed",
795 #endif
796         "pginodesteal",
797         "slabs_scanned",
798         "kswapd_steal",
799         "kswapd_inodesteal",
800         "kswapd_low_wmark_hit_quickly",
801         "kswapd_high_wmark_hit_quickly",
802         "kswapd_skip_congestion_wait",
803         "pageoutrun",
804         "allocstall",
805
806         "pgrotated",
807
808 #ifdef CONFIG_COMPACTION
809         "compact_blocks_moved",
810         "compact_pages_moved",
811         "compact_pagemigrate_failed",
812         "compact_stall",
813         "compact_fail",
814         "compact_success",
815 #endif
816
817 #ifdef CONFIG_HUGETLB_PAGE
818         "htlb_buddy_alloc_success",
819         "htlb_buddy_alloc_fail",
820 #endif
821         "unevictable_pgs_culled",
822         "unevictable_pgs_scanned",
823         "unevictable_pgs_rescued",
824         "unevictable_pgs_mlocked",
825         "unevictable_pgs_munlocked",
826         "unevictable_pgs_cleared",
827         "unevictable_pgs_stranded",
828         "unevictable_pgs_mlockfreed",
829 #endif
830 };
831
832 static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
833                                                         struct zone *zone)
834 {
835         int i;
836         seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
837         seq_printf(m,
838                    "\n  pages free     %lu"
839                    "\n        min      %lu"
840                    "\n        low      %lu"
841                    "\n        high     %lu"
842                    "\n        scanned  %lu"
843                    "\n        spanned  %lu"
844                    "\n        present  %lu",
845                    zone_nr_free_pages(zone),
846                    min_wmark_pages(zone),
847                    low_wmark_pages(zone),
848                    high_wmark_pages(zone),
849                    zone->pages_scanned,
850                    zone->spanned_pages,
851                    zone->present_pages);
852
853         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
854                 seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
855                                 zone_page_state(zone, i));
856
857         seq_printf(m,
858                    "\n        protection: (%lu",
859                    zone->lowmem_reserve[0]);
860         for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
861                 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
862         seq_printf(m,
863                    ")"
864                    "\n  pagesets");
865         for_each_online_cpu(i) {
866                 struct per_cpu_pageset *pageset;
867
868                 pageset = per_cpu_ptr(zone->pageset, i);
869                 seq_printf(m,
870                            "\n    cpu: %i"
871                            "\n              count: %i"
872                            "\n              high:  %i"
873                            "\n              batch: %i",
874                            i,
875                            pageset->pcp.count,
876                            pageset->pcp.high,
877                            pageset->pcp.batch);
878 #ifdef CONFIG_SMP
879                 seq_printf(m, "\n  vm stats threshold: %d",
880                                 pageset->stat_threshold);
881 #endif
882         }
883         seq_printf(m,
884                    "\n  all_unreclaimable: %u"
885                    "\n  start_pfn:         %lu"
886                    "\n  inactive_ratio:    %u",
887                    zone->all_unreclaimable,
888                    zone->zone_start_pfn,
889                    zone->inactive_ratio);
890         seq_putc(m, '\n');
891 }
892
893 /*
894  * Output information about zones in @pgdat.
895  */
896 static int zoneinfo_show(struct seq_file *m, void *arg)
897 {
898         pg_data_t *pgdat = (pg_data_t *)arg;
899         walk_zones_in_node(m, pgdat, zoneinfo_show_print);
900         return 0;
901 }
902
903 static const struct seq_operations zoneinfo_op = {
904         .start  = frag_start, /* iterate over all zones. The same as in
905                                * fragmentation. */
906         .next   = frag_next,
907         .stop   = frag_stop,
908         .show   = zoneinfo_show,
909 };
910
911 static int zoneinfo_open(struct inode *inode, struct file *file)
912 {
913         return seq_open(file, &zoneinfo_op);
914 }
915
916 static const struct file_operations proc_zoneinfo_file_operations = {
917         .open           = zoneinfo_open,
918         .read           = seq_read,
919         .llseek         = seq_lseek,
920         .release        = seq_release,
921 };
922
923 enum writeback_stat_item {
924         NR_DIRTY_THRESHOLD,
925         NR_DIRTY_BG_THRESHOLD,
926         NR_VM_WRITEBACK_STAT_ITEMS,
927 };
928
929 static void *vmstat_start(struct seq_file *m, loff_t *pos)
930 {
931         unsigned long *v;
932         int i, stat_items_size;
933
934         if (*pos >= ARRAY_SIZE(vmstat_text))
935                 return NULL;
936         stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
937                           NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
938
939 #ifdef CONFIG_VM_EVENT_COUNTERS
940         stat_items_size += sizeof(struct vm_event_state);
941 #endif
942
943         v = kmalloc(stat_items_size, GFP_KERNEL);
944         m->private = v;
945         if (!v)
946                 return ERR_PTR(-ENOMEM);
947         for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
948                 v[i] = global_page_state(i);
949         v += NR_VM_ZONE_STAT_ITEMS;
950
951         global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
952                             v + NR_DIRTY_THRESHOLD);
953         v += NR_VM_WRITEBACK_STAT_ITEMS;
954
955 #ifdef CONFIG_VM_EVENT_COUNTERS
956         all_vm_events(v);
957         v[PGPGIN] /= 2;         /* sectors -> kbytes */
958         v[PGPGOUT] /= 2;
959 #endif
960         return (unsigned long *)m->private + *pos;
961 }
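
/*
 * The buffer assembled above mirrors the order of vmstat_text[]: the
 * NR_VM_ZONE_STAT_ITEMS zone counters come first, then the two writeback
 * thresholds, then (when CONFIG_VM_EVENT_COUNTERS is set) the vm event
 * counters, so *pos indexes both arrays consistently in vmstat_show().
 */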
962
963 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
964 {
965         (*pos)++;
966         if (*pos >= ARRAY_SIZE(vmstat_text))
967                 return NULL;
968         return (unsigned long *)m->private + *pos;
969 }
970
971 static int vmstat_show(struct seq_file *m, void *arg)
972 {
973         unsigned long *l = arg;
974         unsigned long off = l - (unsigned long *)m->private;
975
976         seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
977         return 0;
978 }
979
980 static void vmstat_stop(struct seq_file *m, void *arg)
981 {
982         kfree(m->private);
983         m->private = NULL;
984 }
985
986 static const struct seq_operations vmstat_op = {
987         .start  = vmstat_start,
988         .next   = vmstat_next,
989         .stop   = vmstat_stop,
990         .show   = vmstat_show,
991 };
992
993 static int vmstat_open(struct inode *inode, struct file *file)
994 {
995         return seq_open(file, &vmstat_op);
996 }
997
998 static const struct file_operations proc_vmstat_file_operations = {
999         .open           = vmstat_open,
1000         .read           = seq_read,
1001         .llseek         = seq_lseek,
1002         .release        = seq_release,
1003 };
1004 #endif /* CONFIG_PROC_FS */
1005
1006 #ifdef CONFIG_SMP
1007 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1008 int sysctl_stat_interval __read_mostly = HZ;
1009
1010 static void vmstat_update(struct work_struct *w)
1011 {
1012         refresh_cpu_vm_stats(smp_processor_id());
1013         schedule_delayed_work(&__get_cpu_var(vmstat_work),
1014                 round_jiffies_relative(sysctl_stat_interval));
1015 }
1016
1017 static void __cpuinit start_cpu_timer(int cpu)
1018 {
1019         struct delayed_work *work = &per_cpu(vmstat_work, cpu);
1020
1021         INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
1022         schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
1023 }
1024
1025 /*
1026  * Use the cpu notifier to ensure that the thresholds are recalculated
1027  * when necessary.
1028  */
1029 static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
1030                 unsigned long action,
1031                 void *hcpu)
1032 {
1033         long cpu = (long)hcpu;
1034
1035         switch (action) {
1036         case CPU_ONLINE:
1037         case CPU_ONLINE_FROZEN:
1038                 refresh_zone_stat_thresholds();
1039                 start_cpu_timer(cpu);
1040                 node_set_state(cpu_to_node(cpu), N_CPU);
1041                 break;
1042         case CPU_DOWN_PREPARE:
1043         case CPU_DOWN_PREPARE_FROZEN:
1044                 cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
1045                 per_cpu(vmstat_work, cpu).work.func = NULL;
1046                 break;
1047         case CPU_DOWN_FAILED:
1048         case CPU_DOWN_FAILED_FROZEN:
1049                 start_cpu_timer(cpu);
1050                 break;
1051         case CPU_DEAD:
1052         case CPU_DEAD_FROZEN:
1053                 refresh_zone_stat_thresholds();
1054                 break;
1055         default:
1056                 break;
1057         }
1058         return NOTIFY_OK;
1059 }
1060
1061 static struct notifier_block __cpuinitdata vmstat_notifier =
1062         { &vmstat_cpuup_callback, NULL, 0 };
1063 #endif
1064
1065 static int __init setup_vmstat(void)
1066 {
1067 #ifdef CONFIG_SMP
1068         int cpu;
1069
1070         refresh_zone_stat_thresholds();
1071         register_cpu_notifier(&vmstat_notifier);
1072
1073         for_each_online_cpu(cpu)
1074                 start_cpu_timer(cpu);
1075 #endif
1076 #ifdef CONFIG_PROC_FS
1077         proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
1078         proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
1079         proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
1080         proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
1081 #endif
1082         return 0;
1083 }
1084 module_init(setup_vmstat)
1085
1086 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1087 #include <linux/debugfs.h>
1088
1089 static struct dentry *extfrag_debug_root;
1090
1091 /*
1092  * Return an index indicating how much of the available free memory is
1093  * unusable for an allocation of the requested size.
1094  */
1095 static int unusable_free_index(unsigned int order,
1096                                 struct contig_page_info *info)
1097 {
1098         /* No free memory is interpreted as all free memory is unusable */
1099         if (info->free_pages == 0)
1100                 return 1000;
1101
1102         /*
1103          * Index should be a value between 0 and 1. Return a value to 3
1104          * decimal places.
1105          *
1106          * 0 => no fragmentation
1107          * 1 => high fragmentation
1108          */
1109         return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages);
1110
1111 }
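
/*
 * Worked example: assuming a zone with 100 free pages of which 5 free blocks
 * are exactly order 3 (and none larger), an order-3 request can use
 * 5 << 3 = 40 of those pages, so the index is
 * (100 - 40) * 1000 / 100 = 600, i.e. 0.600: 60% of the free memory is
 * unusable at that order.
 */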
1112
1113 static void unusable_show_print(struct seq_file *m,
1114                                         pg_data_t *pgdat, struct zone *zone)
1115 {
1116         unsigned int order;
1117         int index;
1118         struct contig_page_info info;
1119
1120         seq_printf(m, "Node %d, zone %8s ",
1121                                 pgdat->node_id,
1122                                 zone->name);
1123         for (order = 0; order < MAX_ORDER; ++order) {
1124                 fill_contig_page_info(zone, order, &info);
1125                 index = unusable_free_index(order, &info);
1126                 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1127         }
1128
1129         seq_putc(m, '\n');
1130 }
1131
1132 /*
1133  * Display unusable free space index
1134  *
1135  * The unusable free space index measures how much of the available free
1136  * memory cannot be used to satisfy an allocation of a given size and is a
1137  * value between 0 and 1. The higher the value, the more of free memory is
1138  * unusable and by implication, the worse the external fragmentation is. This
1139  * can be expressed as a percentage by multiplying by 100.
1140  */
1141 static int unusable_show(struct seq_file *m, void *arg)
1142 {
1143         pg_data_t *pgdat = (pg_data_t *)arg;
1144
1145         /* check memoryless node */
1146         if (!node_state(pgdat->node_id, N_HIGH_MEMORY))
1147                 return 0;
1148
1149         walk_zones_in_node(m, pgdat, unusable_show_print);
1150
1151         return 0;
1152 }
1153
1154 static const struct seq_operations unusable_op = {
1155         .start  = frag_start,
1156         .next   = frag_next,
1157         .stop   = frag_stop,
1158         .show   = unusable_show,
1159 };
1160
1161 static int unusable_open(struct inode *inode, struct file *file)
1162 {
1163         return seq_open(file, &unusable_op);
1164 }
1165
1166 static const struct file_operations unusable_file_ops = {
1167         .open           = unusable_open,
1168         .read           = seq_read,
1169         .llseek         = seq_lseek,
1170         .release        = seq_release,
1171 };
1172
1173 static void extfrag_show_print(struct seq_file *m,
1174                                         pg_data_t *pgdat, struct zone *zone)
1175 {
1176         unsigned int order;
1177         int index;
1178
1179         /* Alloc on stack as interrupts are disabled for zone walk */
1180         struct contig_page_info info;
1181
1182         seq_printf(m, "Node %d, zone %8s ",
1183                                 pgdat->node_id,
1184                                 zone->name);
1185         for (order = 0; order < MAX_ORDER; ++order) {
1186                 fill_contig_page_info(zone, order, &info);
1187                 index = __fragmentation_index(order, &info);
1188                 seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
1189         }
1190
1191         seq_putc(m, '\n');
1192 }
1193
1194 /*
1195  * Display fragmentation index for orders that allocations would fail for
1196  */
1197 static int extfrag_show(struct seq_file *m, void *arg)
1198 {
1199         pg_data_t *pgdat = (pg_data_t *)arg;
1200
1201         walk_zones_in_node(m, pgdat, extfrag_show_print);
1202
1203         return 0;
1204 }
1205
1206 static const struct seq_operations extfrag_op = {
1207         .start  = frag_start,
1208         .next   = frag_next,
1209         .stop   = frag_stop,
1210         .show   = extfrag_show,
1211 };
1212
1213 static int extfrag_open(struct inode *inode, struct file *file)
1214 {
1215         return seq_open(file, &extfrag_op);
1216 }
1217
1218 static const struct file_operations extfrag_file_ops = {
1219         .open           = extfrag_open,
1220         .read           = seq_read,
1221         .llseek         = seq_lseek,
1222         .release        = seq_release,
1223 };
1224
1225 static int __init extfrag_debug_init(void)
1226 {
1227         extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
1228         if (!extfrag_debug_root)
1229                 return -ENOMEM;
1230
1231         if (!debugfs_create_file("unusable_index", 0444,
1232                         extfrag_debug_root, NULL, &unusable_file_ops))
1233                 return -ENOMEM;
1234
1235         if (!debugfs_create_file("extfrag_index", 0444,
1236                         extfrag_debug_root, NULL, &extfrag_file_ops))
1237                 return -ENOMEM;
1238
1239         return 0;
1240 }
1241
1242 module_init(extfrag_debug_init);
1243 #endif