+- add patches.fixes/linux-post-2.6.3-20040220
[linux-flexiantxendom0-3.2.10.git] / kernel / power / swsusp.c
1 /*
 * linux/kernel/power/swsusp.c
 *
 * This file implements an architecture-independent machine suspend
 * feature, using almost exclusively high-level routines.
6  *
7  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
8  * Copyright (C) 1998,2001-2003 Pavel Machek <pavel@suse.cz>
9  *
10  * This file is released under the GPLv2.
11  *
12  * I'd like to thank the following people for their work:
13  * 
14  * Pavel Machek <pavel@ucw.cz>:
15  * Modifications, defectiveness pointing, being with me at the very beginning,
16  * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
17  *
18  * Steve Doddi <dirk@loth.demon.co.uk>: 
19  * Support the possibility of hardware state restoring.
20  *
21  * Raph <grey.havens@earthling.net>:
22  * Support for preserving states of network devices and virtual console
23  * (including X and svgatextmode)
24  *
25  * Kurt Garloff <garloff@suse.de>:
26  * Straightened the critical function in order to prevent compilers from
27  * playing tricks with local variables.
28  *
29  * Andreas Mohr <a.mohr@mailto.de>
30  *
31  * Alex Badea <vampire@go.ro>:
32  * Fixed runaway init
33  *
34  * More state savers are welcome. Especially for the scsi layer...
35  *
36  * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
37  */
38
39 #include <linux/module.h>
40 #include <linux/mm.h>
41 #include <linux/suspend.h>
42 #include <linux/smp_lock.h>
43 #include <linux/file.h>
44 #include <linux/utsname.h>
45 #include <linux/version.h>
46 #include <linux/delay.h>
47 #include <linux/reboot.h>
48 #include <linux/bitops.h>
49 #include <linux/vt_kern.h>
50 #include <linux/kbd_kern.h>
51 #include <linux/keyboard.h>
52 #include <linux/spinlock.h>
53 #include <linux/genhd.h>
54 #include <linux/kernel.h>
55 #include <linux/major.h>
56 #include <linux/swap.h>
57 #include <linux/pm.h>
58 #include <linux/device.h>
59 #include <linux/buffer_head.h>
60 #include <linux/swapops.h>
61 #include <linux/bootmem.h>
62 #include <linux/console.h>
63
64 #include <asm/uaccess.h>
65 #include <asm/mmu_context.h>
66 #include <asm/pgtable.h>
67 #include <asm/io.h>
68
69 #include "power.h"
70
/* Used to sync filesystems before processes are frozen; defined elsewhere. */
extern long sys_sync(void);

/* Non-zero while software_suspend() may start a new suspend cycle. */
unsigned char software_suspend_enabled = 0;

/* Architecture entry point driving the suspend (0) / resume sequence. */
extern void do_magic(int resume);

/* Resume option flags (not referenced in the part of the file shown here). */
#define NORESUME		1
#define RESUME_SPECIFIED	2

/*
 * Address helpers:
 *   __ADDRESS(x) - kernel virtual address for physical address x
 *   ADDRESS(x)   - kernel virtual address for page frame number x
 *   ADDRESS2(x)  - canonical direct-mapping address of kernel pointer x
 *                  (needed for x86-64 where some pages are in memory twice)
 *
 * ADDRESS() widens its argument to unsigned long before shifting: callers
 * pass an int pfn, and shifting that by PAGE_SHIFT in int arithmetic
 * overflows as soon as the physical address reaches 2GB.
 */
#define __ADDRESS(x)  ((unsigned long) phys_to_virt(x))
#define ADDRESS(x) __ADDRESS((unsigned long)(x) << PAGE_SHIFT)
#define ADDRESS2(x) __ADDRESS(__pa(x))
84
85 /* References to section boundaries */
86 extern char __nosave_begin, __nosave_end;
87
88 extern int is_head_of_free_region(struct page *);
89
90 /* Locks */
91 spinlock_t suspend_pagedir_lock __nosavedata = SPIN_LOCK_UNLOCKED;
92
93 /* Variables to be preserved over suspend */
94 static int pagedir_order_check;
95 static int nr_copy_pages_check;
96
97 static int resume_status;
98 static char resume_file[256] = "";                      /* For resume= kernel option */
99 static dev_t resume_device;
100 /* Local variables that should not be affected by save */
101 unsigned int nr_copy_pages __nosavedata = 0;
102
103 /* Suspend pagedir is allocated before final copy, therefore it
104    must be freed after resume 
105
106    Warning: this is evil. There are actually two pagedirs at time of
107    resume. One is "pagedir_save", which is empty frame allocated at
108    time of suspend, that must be freed. Second is "pagedir_nosave", 
109    allocated at time of resume, that travels through memory not to
110    collide with anything.
111  */
112 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
113 static suspend_pagedir_t *pagedir_save;
114 static int pagedir_order __nosavedata = 0;
115
116 struct link {
117         char dummy[PAGE_SIZE - sizeof(swp_entry_t)];
118         swp_entry_t next;
119 };
120
121 union diskpage {
122         union swap_header swh;
123         struct link link;
124         struct suspend_header sh;
125 };
126
/*
 * XXX: Number of extra pages we try to keep free so that I/O operations
 * succeed without paging. Should this be larger?
 */
#define PAGES_FOR_IO	512

/* Prefixes used in suspend / resume log messages */
static const char name_suspend[] = "Suspend Machine: ";
static const char name_resume[] = "Resume Machine: ";

/*
 * Debug switches
 */
#define DEBUG_DEFAULT
#undef  DEBUG_PROCESS
#undef  DEBUG_SLOW
#define TEST_SWSUSP 0		/* Set to 1 to reboot instead of halt machine after suspension */

/* PRINTK() is printk() when DEBUG_DEFAULT is defined and expands to nothing otherwise */
#if defined(DEBUG_DEFAULT)
# define PRINTK(f, a...)	printk(f, ## a)
#else
# define PRINTK(f, a...)
#endif

/* MDELAY() is mdelay() when DEBUG_SLOW is defined and expands to nothing otherwise */
#if defined(DEBUG_SLOW)
# define MDELAY(a) mdelay(a)
#else
# define MDELAY(a)
#endif
155
156 /*
157  * Saving part...
158  */
159
160 static __inline__ int fill_suspend_header(struct suspend_header *sh)
161 {
162         memset((char *)sh, 0, sizeof(*sh));
163
164         sh->version_code = LINUX_VERSION_CODE;
165         sh->num_physpages = num_physpages;
166         strncpy(sh->machine, system_utsname.machine, 8);
167         strncpy(sh->version, system_utsname.version, 20);
168         /* FIXME: Is this bogus? --RR */
169         sh->num_cpus = num_online_cpus();
170         sh->page_size = PAGE_SIZE;
171         sh->suspend_pagedir = pagedir_nosave;
172         BUG_ON (pagedir_save != pagedir_nosave);
173         sh->num_pbes = nr_copy_pages;
174         /* TODO: needed? mounted fs' last mounted date comparison
175          * [so they haven't been mounted since last suspend.
176          * Maybe it isn't.] [we'd need to do this for _all_ fs-es]
177          */
178         return 0;
179 }
180
181 /* We memorize in swapfile_used what swap devices are used for suspension */
182 #define SWAPFILE_UNUSED    0
183 #define SWAPFILE_SUSPEND   1    /* This is the suspending device */
184 #define SWAPFILE_IGNORED   2    /* Those are other swap devices ignored for suspension */
185
186 static unsigned short swapfile_used[MAX_SWAPFILES];
187 static unsigned short root_swap;
188 #define MARK_SWAP_SUSPEND 0
189 #define MARK_SWAP_RESUME 2
190
191 static void mark_swapfiles(swp_entry_t prev, int mode)
192 {
193         swp_entry_t entry;
194         union diskpage *cur;
195         struct page *page;
196
197         if (root_swap == 0xFFFF)  /* ignored */
198                 return;
199
200         page = alloc_page(GFP_ATOMIC);
201         if (!page)
202                 panic("Out of memory in mark_swapfiles");
203         cur = page_address(page);
204         /* XXX: this is dirty hack to get first page of swap file */
205         entry = swp_entry(root_swap, 0);
206         rw_swap_page_sync(READ, entry, page);
207
208         if (mode == MARK_SWAP_RESUME) {
209                 if (!memcmp("S1",cur->swh.magic.magic,2))
210                         memcpy(cur->swh.magic.magic,"SWAP-SPACE",10);
211                 else if (!memcmp("S2",cur->swh.magic.magic,2))
212                         memcpy(cur->swh.magic.magic,"SWAPSPACE2",10);
213                 else printk("%sUnable to find suspended-data signature (%.10s - misspelled?\n", 
214                         name_resume, cur->swh.magic.magic);
215         } else {
216                 if ((!memcmp("SWAP-SPACE",cur->swh.magic.magic,10)))
217                         memcpy(cur->swh.magic.magic,"S1SUSP....",10);
218                 else if ((!memcmp("SWAPSPACE2",cur->swh.magic.magic,10)))
219                         memcpy(cur->swh.magic.magic,"S2SUSP....",10);
220                 else panic("\nSwapspace is not swapspace (%.10s)\n", cur->swh.magic.magic);
221                 cur->link.next = prev; /* prev is the first/last swap page of the resume area */
222                 /* link.next lies *no more* in last 4/8 bytes of magic */
223         }
224         rw_swap_page_sync(WRITE, entry, page);
225         __free_page(page);
226 }
227
228 static void read_swapfiles(void) /* This is called before saving image */
229 {
230         int i, len;
231         
232         len=strlen(resume_file);
233         root_swap = 0xFFFF;
234         
235         swap_list_lock();
236         for(i=0; i<MAX_SWAPFILES; i++) {
237                 if (swap_info[i].flags == 0) {
238                         swapfile_used[i]=SWAPFILE_UNUSED;
239                 } else {
240                         if(!len) {
241                                 printk(KERN_WARNING "resume= option should be used to set suspend device" );
242                                 if(root_swap == 0xFFFF) {
243                                         swapfile_used[i] = SWAPFILE_SUSPEND;
244                                         root_swap = i;
245                                 } else
246                                         swapfile_used[i] = SWAPFILE_IGNORED;                              
247                         } else {
248                                 /* we ignore all swap devices that are not the resume_file */
249                                 if (1) {
250 // FIXME                                if(resume_device == swap_info[i].swap_device) {
251                                         swapfile_used[i] = SWAPFILE_SUSPEND;
252                                         root_swap = i;
253                                 } else {
254 #if 0
255                                         printk( "Resume: device %s (%x != %x) ignored\n", swap_info[i].swap_file->d_name.name, swap_info[i].swap_device, resume_device );                                 
256 #endif
257                                         swapfile_used[i] = SWAPFILE_IGNORED;
258                                 }
259                         }
260                 }
261         }
262         swap_list_unlock();
263 }
264
265 static void lock_swapdevices(void) /* This is called after saving image so modification
266                                       will be lost after resume... and that's what we want. */
267 {
268         int i;
269
270         swap_list_lock();
271         for(i = 0; i< MAX_SWAPFILES; i++)
272                 if(swapfile_used[i] == SWAPFILE_IGNORED) {
273                         swap_info[i].flags ^= 0xFF; /* we make the device unusable. A new call to
274                                                        lock_swapdevices can unlock the devices. */
275                 }
276         swap_list_unlock();
277 }
278
279 /**
280  *    write_suspend_image - Write entire image to disk.
281  *
282  *    After writing suspend signature to the disk, suspend may no
283  *    longer fail: we have ready-to-run image in swap, and rollback
284  *    would happen on next reboot -- corrupting data.
285  *
286  *    Note: The buffer we allocate to use to write the suspend header is
287  *    not freed; its not needed since the system is going down anyway
288  *    (plus it causes an oops and I'm lazy^H^H^H^Htoo busy).
289  */
290 static int write_suspend_image(void)
291 {
292         int i;
293         swp_entry_t entry, prev = { 0 };
294         int nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
295         union diskpage *cur,  *buffer = (union diskpage *)get_zeroed_page(GFP_ATOMIC);
296         unsigned long address;
297         struct page *page;
298
299         if (!buffer)
300                 return -ENOMEM;
301
302         printk( "Writing data to swap (%d pages): ", nr_copy_pages );
303         for (i=0; i<nr_copy_pages; i++) {
304                 if (!(i%100))
305                         printk( "." );
306                 if (!(entry = get_swap_page()).val)
307                         panic("\nNot enough swapspace when writing data" );
308                 
309                 if (swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
310                         panic("\nPage %d: not enough swapspace on suspend device", i );
311             
312                 address = (pagedir_nosave+i)->address;
313                 page = virt_to_page(address);
314                 rw_swap_page_sync(WRITE, entry, page);
315                 (pagedir_nosave+i)->swap_address = entry;
316         }
317         printk( "|\n" );
318         printk( "Writing pagedir (%d pages): ", nr_pgdir_pages);
319         for (i=0; i<nr_pgdir_pages; i++) {
320                 cur = (union diskpage *)((char *) pagedir_nosave)+i;
321                 BUG_ON ((char *) cur != (((char *) pagedir_nosave) + i*PAGE_SIZE));
322                 printk( "." );
323                 if (!(entry = get_swap_page()).val) {
324                         printk(KERN_CRIT "Not enough swapspace when writing pgdir\n" );
325                         panic("Don't know how to recover");
326                         free_page((unsigned long) buffer);
327                         return -ENOSPC;
328                 }
329
330                 if(swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
331                         panic("\nNot enough swapspace for pagedir on suspend device" );
332
333                 BUG_ON (sizeof(swp_entry_t) != sizeof(long));
334                 BUG_ON (PAGE_SIZE % sizeof(struct pbe));
335
336                 cur->link.next = prev;                          
337                 page = virt_to_page((unsigned long)cur);
338                 rw_swap_page_sync(WRITE, entry, page);
339                 prev = entry;
340         }
341         printk("H");
342         BUG_ON (sizeof(struct suspend_header) > PAGE_SIZE-sizeof(swp_entry_t));
343         BUG_ON (sizeof(union diskpage) != PAGE_SIZE);
344         BUG_ON (sizeof(struct link) != PAGE_SIZE);
345         if (!(entry = get_swap_page()).val)
346                 panic( "\nNot enough swapspace when writing header" );
347         if (swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
348                 panic("\nNot enough swapspace for header on suspend device" );
349
350         cur = (void *) buffer;
351         if (fill_suspend_header(&cur->sh))
352                 panic("\nOut of memory while writing header");
353                 
354         cur->link.next = prev;
355
356         page = virt_to_page((unsigned long)cur);
357         rw_swap_page_sync(WRITE, entry, page);
358         prev = entry;
359
360         printk( "S" );
361         mark_swapfiles(prev, MARK_SWAP_SUSPEND);
362         printk( "|\n" );
363
364         MDELAY(1000);
365         return 0;
366 }
367
368 /* if pagedir_p != NULL it also copies the counted pages */
369 static int count_and_copy_data_pages(struct pbe *pagedir_p)
370 {
371         int chunk_size;
372         int nr_copy_pages = 0;
373         int pfn;
374         struct page *page;
375         
376 #ifdef CONFIG_DISCONTIGMEM
377         panic("Discontingmem not supported");
378 #else
379         BUG_ON (max_pfn != num_physpages);
380 #endif
381         for (pfn = 0; pfn < max_pfn; pfn++) {
382                 page = pfn_to_page(pfn);
383                 if (PageHighMem(page))
384                         panic("Swsusp not supported on highmem boxes. Send 1GB of RAM to <pavel@ucw.cz> and try again ;-).");
385
386                 if (!PageReserved(page)) {
387                         if (PageNosave(page))
388                                 continue;
389
390                         if ((chunk_size=is_head_of_free_region(page))!=0) {
391                                 pfn += chunk_size - 1;
392                                 continue;
393                         }
394                 } else if (PageReserved(page)) {
395                         BUG_ON (PageNosave(page));
396
397                         /*
398                          * Just copy whole code segment. Hopefully it is not that big.
399                          */
400                         if ((ADDRESS(pfn) >= (unsigned long) ADDRESS2(&__nosave_begin)) && 
401                             (ADDRESS(pfn) <  (unsigned long) ADDRESS2(&__nosave_end))) {
402                                 PRINTK("[nosave %lx]", ADDRESS(pfn));
403                                 continue;
404                         }
405                         /* Hmm, perhaps copying all reserved pages is not too healthy as they may contain 
406                            critical bios data? */
407                 } else  BUG();
408
409                 nr_copy_pages++;
410                 if (pagedir_p) {
411                         pagedir_p->orig_address = ADDRESS(pfn);
412                         copy_page((void *) pagedir_p->address, (void *) pagedir_p->orig_address);
413                         pagedir_p++;
414                 }
415         }
416         return nr_copy_pages;
417 }
418
419 static void free_suspend_pagedir(unsigned long this_pagedir)
420 {
421         struct page *page;
422         int pfn;
423         unsigned long this_pagedir_end = this_pagedir +
424                 (PAGE_SIZE << pagedir_order);
425
426         for(pfn = 0; pfn < num_physpages; pfn++) {
427                 page = pfn_to_page(pfn);
428                 if (!TestClearPageNosave(page))
429                         continue;
430
431                 if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end)
432                         continue; /* old pagedir gets freed in one */
433                 
434                 free_page(ADDRESS(pfn));
435         }
436         free_pages(this_pagedir, pagedir_order);
437 }
438
439 static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
440 {
441         int i;
442         suspend_pagedir_t *pagedir;
443         struct pbe *p;
444         struct page *page;
445
446         pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));
447
448         p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order);
449         if(!pagedir)
450                 return NULL;
451
452         page = virt_to_page(pagedir);
453         for(i=0; i < 1<<pagedir_order; i++)
454                 SetPageNosave(page++);
455                 
456         while(nr_copy_pages--) {
457                 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
458                 if(!p->address) {
459                         free_suspend_pagedir((unsigned long) pagedir);
460                         return NULL;
461                 }
462                 SetPageNosave(virt_to_page(p->address));
463                 p->orig_address = 0;
464                 p++;
465         }
466         return pagedir;
467 }
468
469 static int prepare_suspend_processes(void)
470 {
471         sys_sync();     /* Syncing needs pdflushd, so do it before stopping processes */
472         if (freeze_processes()) {
473                 printk( KERN_ERR "Suspend failed: Not all processes stopped!\n" );
474                 thaw_processes();
475                 return 1;
476         }
477         return 0;
478 }
479
/*
 * Try to free as much memory as possible, but do not OOM-kill anyone.
 * shrink_all_memory() is called repeatedly until it returns 0; one dot is
 * printed per successful round.
 *
 * Notice: all userland should be stopped at this point, or livelock is possible.
 */
static void free_some_memory(void)
{
	printk("Freeing memory: ");
	for (;;) {
		if (!shrink_all_memory(10000))
			break;
		printk(".");
	}
	printk("|\n");
}
492
493 static int suspend_prepare_image(void)
494 {
495         struct sysinfo i;
496         unsigned int nr_needed_pages = 0;
497
498         drain_local_pages();
499
500         pagedir_nosave = NULL;
501         printk( "/critical section: Counting pages to copy" );
502         nr_copy_pages = count_and_copy_data_pages(NULL);
503         nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;
504         
505         printk(" (pages needed: %d+%d=%d free: %d)\n",nr_copy_pages,PAGES_FOR_IO,nr_needed_pages,nr_free_pages());
506         if(nr_free_pages() < nr_needed_pages) {
507                 printk(KERN_CRIT "%sCouldn't get enough free pages, on %d pages short\n",
508                        name_suspend, nr_needed_pages-nr_free_pages());
509                 root_swap = 0xFFFF;
510                 return 1;
511         }
512         si_swapinfo(&i);        /* FIXME: si_swapinfo(&i) returns all swap devices information.
513                                    We should only consider resume_device. */
514         if (i.freeswap < nr_needed_pages)  {
515                 printk(KERN_CRIT "%sThere's not enough swap space available, on %ld pages short\n",
516                        name_suspend, nr_needed_pages-i.freeswap);
517                 return 1;
518         }
519
520         PRINTK( "Alloc pagedir\n" ); 
521         pagedir_save = pagedir_nosave = create_suspend_pagedir(nr_copy_pages);
522         if(!pagedir_nosave) {
523                 /* Shouldn't happen */
524                 printk(KERN_CRIT "%sCouldn't allocate enough pages\n",name_suspend);
525                 panic("Really should not happen");
526                 return 1;
527         }
528         nr_copy_pages_check = nr_copy_pages;
529         pagedir_order_check = pagedir_order;
530
531         drain_local_pages();    /* During allocating of suspend pagedir, new cold pages may appear. Kill them */
532         if (nr_copy_pages != count_and_copy_data_pages(pagedir_nosave)) /* copy */
533                 BUG();
534
535         /*
536          * End of critical section. From now on, we can write to memory,
537          * but we should not touch disk. This specially means we must _not_
538          * touch swap space! Except we must write out our image of course.
539          */
540
541         printk( "critical section/: done (%d pages copied)\n", nr_copy_pages );
542         return 0;
543 }
544
/*
 * suspend_save_image - resume the devices and write the image to swap.
 *
 * The swap devices that are not used for suspension are made unusable
 * while the image is written and are unlocked again afterwards. The
 * result of write_suspend_image() is not examined.
 */
static void suspend_save_image(void)
{
	device_resume();

	lock_swapdevices();	/* first call: lock the ignored swap devices */
	(void) write_suspend_image();
	lock_swapdevices();	/* This will unlock ignored swap devices since writing is finished */

	/* It is important _NOT_ to umount filesystems at this point. We want
	 * them synced (in case something goes wrong) but we DO not want to mark
	 * filesystem clean: it is not. (And it does not matter, if we resume
	 * correctly, we'll mark system clean, anyway.)
	 */
}
559
560 static void suspend_power_down(void)
561 {
562         extern int C_A_D;
563         C_A_D = 0;
564         printk(KERN_EMERG "%s%s Trying to power down.\n", name_suspend, TEST_SWSUSP ? "Disable TEST_SWSUSP. NOT ": "");
565 #ifdef CONFIG_VT
566         PRINTK(KERN_EMERG "shift_state: %04x\n", shift_state);
567         mdelay(1000);
568         if (TEST_SWSUSP ^ (!!(shift_state & (1 << KG_CTRL))))
569                 machine_restart(NULL);
570         else
571 #endif
572         {
573                 device_shutdown();
574                 machine_power_off();
575         }
576
577         printk(KERN_EMERG "%sProbably not capable for powerdown. System halted.\n", name_suspend);
578         machine_halt();
579         while (1);
580         /* NOTREACHED */
581 }
582
583 /*
584  * Magic happens here
585  */
586
587 void do_magic_resume_1(void)
588 {
589         barrier();
590         mb();
591         spin_lock_irq(&suspend_pagedir_lock);   /* Done to disable interrupts */ 
592
593         device_power_down(4);
594         PRINTK( "Waiting for DMAs to settle down...\n");
595         mdelay(1000);   /* We do not want some readahead with DMA to corrupt our memory, right?
596                            Do it with disabled interrupts for best effect. That way, if some
597                            driver scheduled DMA, we have good chance for DMA to finish ;-). */
598 }
599
600 void do_magic_resume_2(void)
601 {
602         BUG_ON (nr_copy_pages_check != nr_copy_pages);
603         BUG_ON (pagedir_order_check != pagedir_order);
604
605         __flush_tlb_global();           /* Even mappings of "global" things (vmalloc) need to be fixed */
606
607         PRINTK( "Freeing prev allocated pagedir\n" );
608         free_suspend_pagedir((unsigned long) pagedir_save);
609         device_power_up();
610         spin_unlock_irq(&suspend_pagedir_lock);
611         device_resume();
612
613         acquire_console_sem();
614         update_screen(fg_console);      /* Hmm, is this the problem? */
615         release_console_sem();
616
617         PRINTK( "Fixing swap signatures... " );
618         mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
619         PRINTK( "ok\n" );
620
621 #ifdef SUSPEND_CONSOLE
622         update_screen(fg_console);      /* Hmm, is this the problem? */
623 #endif
624 }
625
626 /* do_magic() is implemented in arch/?/kernel/suspend_asm.S, and basically does:
627
628         if (!resume) {
629                 do_magic_suspend_1();
630                 save_processor_state();
631                 SAVE_REGISTERS
632                 do_magic_suspend_2();
633                 return;
634         }
635         GO_TO_SWAPPER_PAGE_TABLES
636         do_magic_resume_1();
637         COPY_PAGES_BACK
638         RESTORE_REGISTERS
639         restore_processor_state();
640         do_magic_resume_2();
641
642  */
643
644 void do_magic_suspend_1(void)
645 {
646         mb();
647         barrier();
648         BUG_ON(in_atomic());
649         spin_lock_irq(&suspend_pagedir_lock);
650 }
651
652 void do_magic_suspend_2(void)
653 {
654         int is_problem;
655         read_swapfiles();
656         device_power_down(4);
657         is_problem = suspend_prepare_image();
658         device_power_up();
659         spin_unlock_irq(&suspend_pagedir_lock);
660         if (!is_problem) {
661                 kernel_fpu_end();       /* save_processor_state() does kernel_fpu_begin, and we need to revert it in order to pass in_atomic() checks */
662                 BUG_ON(in_atomic());
663                 suspend_save_image();
664                 suspend_power_down();   /* FIXME: if suspend_power_down is commented out, console is lost after few suspends ?! */
665         }
666
667         printk(KERN_EMERG "%sSuspend failed, trying to recover...\n", name_suspend);
668         MDELAY(1000); /* So user can wait and report us messages if armageddon comes :-) */
669
670         barrier();
671         mb();
672         spin_lock_irq(&suspend_pagedir_lock);   /* Done to disable interrupts */ 
673         mdelay(1000);
674
675         free_pages((unsigned long) pagedir_nosave, pagedir_order);
676         spin_unlock_irq(&suspend_pagedir_lock);
677         mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
678 }
679
680 /*
681  * This is main interface to the outside world. It needs to be
682  * called from process context.
683  */
684 int software_suspend(void)
685 {
686         int res;
687         if (!software_suspend_enabled)
688                 return -EAGAIN;
689
690         software_suspend_enabled = 0;
691         might_sleep();
692
693         if (arch_prepare_suspend()) {
694                 printk("%sArchitecture failed to prepare\n", name_suspend);
695                 return -EPERM;
696         }               
697         if (pm_prepare_console())
698                 printk( "%sCan't allocate a console... proceeding\n", name_suspend);
699         if (!prepare_suspend_processes()) {
700
701                 /* At this point, all user processes and "dangerous"
702                    kernel threads are stopped. Free some memory, as we
703                    need half of memory free. */
704
705                 free_some_memory();
706                 
707                 /* No need to invalidate any vfsmnt list -- 
708                  * they will be valid after resume, anyway.
709                  */
710                 blk_run_queues();
711
712                 /* Save state of all device drivers, and stop them. */             
713                 if ((res = device_suspend(4))==0)
714                         /* If stopping device drivers worked, we proceed basically into
715                          * suspend_save_image.
716                          *
717                          * do_magic(0) returns after system is resumed.
718                          *
719                          * do_magic() copies all "used" memory to "free" memory, then
720                          * unsuspends all device drivers, and writes memory to disk
721                          * using normal kernel mechanism.
722                          */
723                         do_magic(0);
724                 thaw_processes();
725         } else
726                 res = -EBUSY;
727         software_suspend_enabled = 1;
728         MDELAY(1000);
729         pm_restore_console();
730         return res;
731 }
732
733 /* More restore stuff */
734
735 /* FIXME: Why not memcpy(to, from, 1<<pagedir_order*PAGE_SIZE)? */
736 static void copy_pagedir(suspend_pagedir_t *to, suspend_pagedir_t *from)
737 {
738         int i;
739         char *topointer=(char *)to, *frompointer=(char *)from;
740
741         for(i=0; i < 1 << pagedir_order; i++) {
742                 copy_page(topointer, frompointer);
743                 topointer += PAGE_SIZE;
744                 frompointer += PAGE_SIZE;
745         }
746 }
747
748 #define does_collide(addr) does_collide_order(pagedir_nosave, addr, 0)
749
750 /*
751  * Returns true if given address/order collides with any orig_address 
752  */
753 static int does_collide_order(suspend_pagedir_t *pagedir, unsigned long addr,
754                 int order)
755 {
756         int i;
757         unsigned long addre = addr + (PAGE_SIZE<<order);
758         
759         for(i=0; i < nr_copy_pages; i++)
760                 if((pagedir+i)->orig_address >= addr &&
761                         (pagedir+i)->orig_address < addre)
762                         return 1;
763
764         return 0;
765 }
766
767 /*
768  * We check here that pagedir & pages it points to won't collide with pages
769  * where we're going to restore from the loaded pages later
770  */
771 static int check_pagedir(void)
772 {
773         int i;
774
775         for(i=0; i < nr_copy_pages; i++) {
776                 unsigned long addr;
777
778                 do {
779                         addr = get_zeroed_page(GFP_ATOMIC);
780                         if(!addr)
781                                 return -ENOMEM;
782                 } while (does_collide(addr));
783
784                 (pagedir_nosave+i)->address = addr;
785         }
786         return 0;
787 }
788
789 static int relocate_pagedir(void)
790 {
791         /*
792          * We have to avoid recursion (not to overflow kernel stack),
793          * and that's why code looks pretty cryptic 
794          */
795         suspend_pagedir_t *new_pagedir, *old_pagedir = pagedir_nosave;
796         void **eaten_memory = NULL;
797         void **c = eaten_memory, *m, *f;
798
799         printk("Relocating pagedir");
800
801         if(!does_collide_order(old_pagedir, (unsigned long)old_pagedir, pagedir_order)) {
802                 printk("not necessary\n");
803                 return 0;
804         }
805
806         while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order))) {
807                 memset(m, 0, PAGE_SIZE);
808                 if (!does_collide_order(old_pagedir, (unsigned long)m, pagedir_order))
809                         break;
810                 eaten_memory = m;
811                 printk( "." ); 
812                 *eaten_memory = c;
813                 c = eaten_memory;
814         }
815
816         if (!m)
817                 return -ENOMEM;
818
819         pagedir_nosave = new_pagedir = m;
820         copy_pagedir(new_pagedir, old_pagedir);
821
822         c = eaten_memory;
823         while(c) {
824                 printk(":");
825                 f = *c;
826                 c = *c;
827                 if (f)
828                         free_pages((unsigned long)f, pagedir_order);
829         }
830         printk("|\n");
831         return 0;
832 }
833
/*
 * Sanity check: does this image make sense with this kernel/swap context?
 * It is certainly not foolproof, but it is better than nothing.
 */
838
839 static int sanity_check_failed(char *reason)
840 {
841         printk(KERN_ERR "%s%s\n", name_resume, reason);
842         return -EPERM;
843 }
844
845 static int sanity_check(struct suspend_header *sh)
846 {
847         if (sh->version_code != LINUX_VERSION_CODE)
848                 return sanity_check_failed("Incorrect kernel version");
849         if (sh->num_physpages != num_physpages)
850                 return sanity_check_failed("Incorrect memory size");
851         if (strncmp(sh->machine, system_utsname.machine, 8))
852                 return sanity_check_failed("Incorrect machine type");
853         if (strncmp(sh->version, system_utsname.version, 20))
854                 return sanity_check_failed("Incorrect version");
855         if (sh->num_cpus != num_online_cpus())
856                 return sanity_check_failed("Incorrect number of cpus");
857         if (sh->page_size != PAGE_SIZE)
858                 return sanity_check_failed("Incorrect PAGE_SIZE");
859         return 0;
860 }
861
862 static int bdev_read_page(struct block_device *bdev, long pos, void *buf)
863 {
864         struct buffer_head *bh;
865         BUG_ON (pos%PAGE_SIZE);
866         bh = __bread(bdev, pos/PAGE_SIZE, PAGE_SIZE);
867         if (!bh || (!bh->b_data)) {
868                 return -1;
869         }
870         memcpy(buf, bh->b_data, PAGE_SIZE);     /* FIXME: may need kmap() */
871         BUG_ON(!buffer_uptodate(bh));
872         brelse(bh);
873         return 0;
874
875
876 static int bdev_write_page(struct block_device *bdev, long pos, void *buf)
877 {
878 #if 0
879         struct buffer_head *bh;
880         BUG_ON (pos%PAGE_SIZE);
881         bh = __bread(bdev, pos/PAGE_SIZE, PAGE_SIZE);
882         if (!bh || (!bh->b_data)) {
883                 return -1;
884         }
885         memcpy(bh->b_data, buf, PAGE_SIZE);     /* FIXME: may need kmap() */
886         BUG_ON(!buffer_uptodate(bh));
887         generic_make_request(WRITE, bh);
888         if (!buffer_uptodate(bh))
889                 printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unsuccessful...\n", name_resume, resume_file);
890         wait_on_buffer(bh);
891         brelse(bh);
892         return 0;
893 #endif
894         printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unimplemented...\n", name_resume, resume_file);
895         return 0;
896 }
897
/*
 * Declared here rather than through a header; read_suspend_image() uses it
 * to turn the resume device specifier string into a dev_t.
 */
extern dev_t __init name_to_dev_t(const char *line);
899
900 static int __init __read_suspend_image(struct block_device *bdev, union diskpage *cur, int noresume)
901 {
902         swp_entry_t next;
903         int i, nr_pgdir_pages;
904
905 #define PREPARENEXT \
906         {       next = cur->link.next; \
907                 next.val = swp_offset(next) * PAGE_SIZE; \
908         }
909
910         if (bdev_read_page(bdev, 0, cur)) return -EIO;
911
912         if ((!memcmp("SWAP-SPACE",cur->swh.magic.magic,10)) ||
913             (!memcmp("SWAPSPACE2",cur->swh.magic.magic,10))) {
914                 printk(KERN_ERR "%sThis is normal swap space\n", name_resume );
915                 return -EINVAL;
916         }
917
918         PREPARENEXT; /* We have to read next position before we overwrite it */
919
920         if (!memcmp("S1",cur->swh.magic.magic,2))
921                 memcpy(cur->swh.magic.magic,"SWAP-SPACE",10);
922         else if (!memcmp("S2",cur->swh.magic.magic,2))
923                 memcpy(cur->swh.magic.magic,"SWAPSPACE2",10);
924         else {
925                 if (noresume)
926                         return -EINVAL;
927                 panic("%sUnable to find suspended-data signature (%.10s - misspelled?\n", 
928                         name_resume, cur->swh.magic.magic);
929         }
930         if (noresume) {
931                 /* We don't do a sanity check here: we want to restore the swap
932                    whatever version of kernel made the suspend image;
933                    We need to write swap, but swap is *not* enabled so
934                    we must write the device directly */
935                 printk("%s: Fixing swap signatures %s...\n", name_resume, resume_file);
936                 bdev_write_page(bdev, 0, cur);
937         }
938
939         printk( "%sSignature found, resuming\n", name_resume );
940         MDELAY(1000);
941
942         if (bdev_read_page(bdev, next.val, cur)) return -EIO;
943         if (sanity_check(&cur->sh))     /* Is this same machine? */     
944                 return -EPERM;
945         PREPARENEXT;
946
947         pagedir_save = cur->sh.suspend_pagedir;
948         nr_copy_pages = cur->sh.num_pbes;
949         nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
950         pagedir_order = get_bitmask_order(nr_pgdir_pages);
951
952         pagedir_nosave = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC, pagedir_order);
953         if (!pagedir_nosave)
954                 return -ENOMEM;
955
956         PRINTK( "%sReading pagedir, ", name_resume );
957
958         /* We get pages in reverse order of saving! */
959         for (i=nr_pgdir_pages-1; i>=0; i--) {
960                 BUG_ON (!next.val);
961                 cur = (union diskpage *)((char *) pagedir_nosave)+i;
962                 if (bdev_read_page(bdev, next.val, cur)) return -EIO;
963                 PREPARENEXT;
964         }
965         BUG_ON (next.val);
966
967         if (relocate_pagedir())
968                 return -ENOMEM;
969         if (check_pagedir())
970                 return -ENOMEM;
971
972         printk( "Reading image data (%d pages): ", nr_copy_pages );
973         for(i=0; i < nr_copy_pages; i++) {
974                 swp_entry_t swap_address = (pagedir_nosave+i)->swap_address;
975                 if (!(i%100))
976                         printk( "." );
977                 /* You do not need to check for overlaps...
978                    ... check_pagedir already did this work */
979                 if (bdev_read_page(bdev, swp_offset(swap_address) * PAGE_SIZE, (char *)((pagedir_nosave+i)->address)))
980                         return -EIO;
981         }
982         printk( "|\n" );
983         return 0;
984 }
985
986 static int read_suspend_image(const char * specialfile, int noresume)
987 {
988         union diskpage *cur;
989         unsigned long scratch_page = 0;
990         int error;
991         char b[BDEVNAME_SIZE];
992
993         resume_device = name_to_dev_t(specialfile);
994         scratch_page = get_zeroed_page(GFP_ATOMIC);
995         cur = (void *) scratch_page;
996         if (cur) {
997                 struct block_device *bdev;
998                 printk("Resuming from device %s\n",
999                                 __bdevname(resume_device, b));
1000                 bdev = open_by_devnum(resume_device, FMODE_READ);
1001                 if (IS_ERR(bdev)) {
1002                         error = PTR_ERR(bdev);
1003                 } else {
1004                         set_blocksize(bdev, PAGE_SIZE);
1005                         error = __read_suspend_image(bdev, cur, noresume);
1006                         blkdev_put(bdev);
1007                 }
1008         } else error = -ENOMEM;
1009
1010         if (scratch_page)
1011                 free_page(scratch_page);
1012         switch (error) {
1013                 case 0:
1014                         PRINTK("Reading resume file was successful\n");
1015                         break;
1016                 case -EINVAL:
1017                         break;
1018                 case -EIO:
1019                         printk( "%sI/O error\n", name_resume);
1020                         break;
1021                 case -ENOENT:
1022                         printk( "%s%s: No such file or directory\n", name_resume, specialfile);
1023                         break;
1024                 case -ENOMEM:
1025                         printk( "%sNot enough memory\n", name_resume);
1026                         break;
1027                 default:
1028                         printk( "%sError %d resuming\n", name_resume, error );
1029         }
1030         MDELAY(1000);
1031         return error;
1032 }
1033
1034 /**
1035  *      software_resume - Resume from a saved image.
1036  *
1037  *      Called as a late_initcall (so all devices are discovered and 
1038  *      initialized), we call swsusp to see if we have a saved image or not.
1039  *      If so, we quiesce devices, then restore the saved image. We will 
1040  *      return above (in pm_suspend_disk() ) if everything goes well. 
1041  *      Otherwise, we fail gracefully and return to the normally 
1042  *      scheduled program.
1043  *
1044  */
1045 static int __init software_resume(void)
1046 {
1047         if (num_online_cpus() > 1) {
1048                 printk(KERN_WARNING "Software Suspend has malfunctioning SMP support. Disabled :(\n");  
1049                 return -EINVAL;
1050         }
1051         /* We enable the possibility of machine suspend */
1052         software_suspend_enabled = 1;
1053         if (!resume_status)
1054                 return 0;
1055
1056         printk( "%s", name_resume );
1057         if (resume_status == NORESUME) {
1058                 if(resume_file[0])
1059                         read_suspend_image(resume_file, 1);
1060                 printk( "disabled\n" );
1061                 return 0;
1062         }
1063         MDELAY(1000);
1064
1065         if (pm_prepare_console())
1066                 printk("swsusp: Can't allocate a console... proceeding\n");
1067
1068         if (!resume_file[0] && resume_status == RESUME_SPECIFIED) {
1069                 printk( "suspension device unspecified\n" );
1070                 return -EINVAL;
1071         }
1072
1073         printk( "resuming from %s\n", resume_file);
1074         if (read_suspend_image(resume_file, 0))
1075                 goto read_failure;
1076         device_suspend(4);
1077         do_magic(1);
1078         panic("This never returns");
1079
1080 read_failure:
1081         pm_restore_console();
1082         return 0;
1083 }
1084
1085 late_initcall(software_resume);
1086
1087 static int __init resume_setup(char *str)
1088 {
1089         if (resume_status == NORESUME)
1090                 return 1;
1091
1092         strncpy( resume_file, str, 255 );
1093         resume_status = RESUME_SPECIFIED;
1094
1095         return 1;
1096 }
1097
1098 static int __init noresume_setup(char *str)
1099 {
1100         resume_status = NORESUME;
1101         return 1;
1102 }
1103
/* Kernel command-line hooks: "noresume" and "resume=<device>". */
__setup("noresume", noresume_setup);
__setup("resume=", resume_setup);

/* Symbols made available outside this file. */
EXPORT_SYMBOL(software_suspend);
EXPORT_SYMBOL(software_suspend_enabled);