9a1aa4098e715030a2ddcd62e130ac2b25488f7b
[linux-flexiantxendom0-3.2.10.git] / kernel / suspend.c
1 /*
2  * linux/kernel/suspend.c
3  *
4  * This file is to realize architecture-independent
5  * machine suspend feature using pretty near only high-level routines
6  *
7  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
8  * Copyright (C) 1998,2001,2002 Pavel Machek <pavel@suse.cz>
9  *
10  * I'd like to thank the following people for their work:
11  * 
12  * Pavel Machek <pavel@ucw.cz>:
13  * Modifications, defectiveness pointing, being with me at the very beginning,
14  * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
15  *
16  * Steve Doddi <dirk@loth.demon.co.uk>: 
17  * Support the possibility of hardware state restoring.
18  *
19  * Raph <grey.havens@earthling.net>:
20  * Support for preserving states of network devices and virtual console
21  * (including X and svgatextmode)
22  *
23  * Kurt Garloff <garloff@suse.de>:
24  * Straightened the critical function in order to prevent compilers from
25  * playing tricks with local variables.
26  *
27  * Andreas Mohr <a.mohr@mailto.de>
28  *
29  * Alex Badea <vampire@go.ro>:
30  * Fixed runaway init
31  *
32  * More state savers are welcome. Especially for the scsi layer...
33  *
34  * For TODOs,FIXMEs also look in Documentation/swsusp.txt
35  */
36
37 #include <linux/module.h>
38 #include <linux/mm.h>
39 #include <linux/suspend.h>
40 #include <linux/smp_lock.h>
41 #include <linux/file.h>
42 #include <linux/utsname.h>
43 #include <linux/version.h>
44 #include <linux/delay.h>
45 #include <linux/reboot.h>
46 #include <linux/vt_kern.h>
47 #include <linux/bitops.h>
48 #include <linux/interrupt.h>
49 #include <linux/kbd_kern.h>
50 #include <linux/keyboard.h>
51 #include <linux/spinlock.h>
52 #include <linux/genhd.h>
53 #include <linux/kernel.h>
54 #include <linux/major.h>
55 #include <linux/swap.h>
56 #include <linux/pm.h>
57 #include <linux/device.h>
58 #include <linux/buffer_head.h>
59 #include <linux/swapops.h>
60 #include <linux/bootmem.h>
61
62 #include <asm/uaccess.h>
63 #include <asm/mmu_context.h>
64 #include <asm/pgtable.h>
65 #include <asm/io.h>
66
/* Declared here by hand; called before tasks are frozen (see prepare_suspend_processes). */
extern long sys_sync(void);

/* Global enable flag; it starts out as 0 and is not read anywhere in this
 * part of the file. NOTE(review): presumably set once resume handling has
 * finished -- confirm against the rest of the file. */
unsigned char software_suspend_enabled = 0;

/* Virtual console that suspend switches to: the last possible console. */
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
/* With SUSPEND_CONSOLE defined, suspend looks *really* cool, but
   we probably do not take enough locks for switching consoles, etc.,
   so bad things might happen.
*/
/* Without a VT console there is nothing to switch to, so drop the definition. */
#if !defined(CONFIG_VT) || !defined(CONFIG_VT_CONSOLE)
#undef SUSPEND_CONSOLE
#endif
79
#define TIMEOUT (6 * HZ)			/* Timeout for stopping processes */

/* Kernel virtual address, as an unsigned long, of physical address x. */
#define __ADDRESS(x)  ((unsigned long) phys_to_virt(x))

/*
 * Kernel virtual address of page frame number x.
 *
 * x is widened to unsigned long *before* the shift: callers pass plain
 * `int` page frame numbers, and shifting an int left by PAGE_SHIFT
 * overflows as soon as the physical address reaches 2GB.
 */
#define ADDRESS(x) __ADDRESS((unsigned long) (x) << PAGE_SHIFT)

/* Needed for x86-64 where some pages are in memory twice */
#define ADDRESS2(x) __ADDRESS(__pa(x))
84
/* References to section boundaries: pages between these two symbols are
   skipped when the memory image is collected (see count_and_copy_data_pages). */
extern char __nosave_begin, __nosave_end;

/* Returns the length of the free region starting at the given page, or 0
   if the page does not start one (that is how the caller uses the result). */
extern int is_head_of_free_region(struct page *);

/* Locks */
/* Taken with spin_lock_irq() around the critical copy sections, mainly to
   keep interrupts disabled there. */
spinlock_t suspend_pagedir_lock __nosavedata = SPIN_LOCK_UNLOCKED;

/* Variables to be preserved over suspend */
static int new_loglevel = 7;		/* console loglevel used while suspending */
static int orig_loglevel = 0;		/* console loglevel to put back afterwards */
static int orig_fgconsole, orig_kmsg;	/* foreground console / kmsg redirect before the switch */
static int pagedir_order_check;		/* copy of pagedir_order, verified on resume */
static int nr_copy_pages_check;		/* copy of nr_copy_pages, verified on resume */

static int resume_status = 0;
static char resume_file[256] = "";                      /* For resume= kernel option */
static dev_t resume_device;
/* Local variables that should not be affected by save */
unsigned int nr_copy_pages __nosavedata = 0;

/* Non-zero once PM_SUSPEND has been broadcast with pm_send_all() and until
   the matching PM_RESUME has been sent. */
static int pm_suspend_state = 0;

/* Suspend pagedir is allocated before final copy, therefore it
   must be freed after resume.

   Warning: this is evil. There are actually two pagedirs at the time of
   resume. One is "pagedir_save", which is an empty frame allocated at
   the time of suspend, and that must be freed. The second is
   "pagedir_nosave", allocated at the time of resume, which travels
   through memory so as not to collide with anything.
 */
suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
static suspend_pagedir_t *pagedir_save;
static int pagedir_order __nosavedata = 0;	/* allocation order of the pagedir */
120
/*
 * One page-sized record whose last sizeof(swp_entry_t) bytes hold the swap
 * entry of the next page in the on-disk chain.
 */
struct link {
        char dummy[PAGE_SIZE - sizeof(swp_entry_t)];
        swp_entry_t next;
};

/*
 * The different views of a single page as it is stored in swap: the
 * ordinary swap header, a chain link, or the suspend header.
 * write_suspend_image() asserts that this union is exactly PAGE_SIZE bytes.
 */
union diskpage {
        union swap_header swh;
        struct link link;
        struct suspend_header sh;
};
131
/*
 * XXX: We try to keep some more pages free so that I/O operations succeed
 * without paging. Should this be more?
 */
#define PAGES_FOR_IO    512

/* Prefixes for the messages printed by the suspend and resume paths. */
static const char name_suspend[] = "Suspend Machine: ";
static const char name_resume[] = "Resume Machine: ";

/*
 * Debug
 */
#define DEBUG_DEFAULT
#undef  DEBUG_PROCESS
#undef  DEBUG_SLOW
#define TEST_SWSUSP 0           /* Set to 1 to reboot instead of halt machine after suspension */

/* PRINTK() is printk() when DEBUG_DEFAULT is defined and expands to nothing otherwise. */
#ifdef DEBUG_DEFAULT
# define PRINTK(f, a...)       printk(f, ## a)
#else
# define PRINTK(f, a...)
#endif

/* MDELAY() only delays when DEBUG_SLOW is defined; otherwise it expands to nothing. */
#ifdef DEBUG_SLOW
#define MDELAY(a) mdelay(a)
#else
#define MDELAY(a)
#endif
160
161 /*
162  * Refrigerator and related stuff
163  */
164
/*
 * Loop-body filter used while walking the task list: `continue`s past the
 * tasks the freezer must not touch (PF_IOTHREAD tasks, the current task and
 * zombies).
 *
 * It has to be expanded directly inside a loop body and is deliberately not
 * wrapped in do { } while (0): the `continue` must act on the caller's loop.
 * The argument is parenthesized so that any pointer expression may be passed.
 */
#define INTERESTING(p) \
			/* We don't want to touch kernel_threads..*/ \
			if ((p)->flags & PF_IOTHREAD) \
				continue; \
			if ((p) == current) \
				continue; \
			if ((p)->state == TASK_ZOMBIE) \
				continue;
173
174 /* Refrigerator is place where frozen processes are stored :-). */
175 void refrigerator(unsigned long flag)
176 {
177         /* You need correct to work with real-time processes.
178            OTOH, this way one process may see (via /proc/) some other
179            process in stopped state (and thereby discovered we were
180            suspended. We probably do not care. 
181          */
182         long save;
183         save = current->state;
184         current->state = TASK_STOPPED;
185         PRINTK("%s entered refrigerator\n", current->comm);
186         printk("=");
187         current->flags &= ~PF_FREEZE;
188         if (flag)
189                 flush_signals(current); /* We have signaled a kernel thread, which isn't normal behaviour
190                                            and that may lead to 100%CPU sucking because those threads
191                                            just don't manage signals. */
192         current->flags |= PF_FROZEN;
193         while (current->flags & PF_FROZEN)
194                 schedule();
195         PRINTK("%s left refrigerator\n", current->comm);
196         current->state = save;
197 }
198
199 /* 0 = success, else # of processes that we failed to stop */
200 int freeze_processes(void)
201 {
202        int todo;
203        unsigned long start_time;
204         struct task_struct *g, *p;
205         
206         printk( "Stopping tasks: " );
207         start_time = jiffies;
208         do {
209                 todo = 0;
210                 read_lock(&tasklist_lock);
211                 do_each_thread(g, p) {
212                         unsigned long flags;
213                         INTERESTING(p);
214                         if (p->flags & PF_FROZEN)
215                                 continue;
216
217                         /* FIXME: smp problem here: we may not access other process' flags
218                            without locking */
219                         p->flags |= PF_FREEZE;
220                         spin_lock_irqsave(&p->sighand->siglock, flags);
221                         signal_wake_up(p, 0);
222                         spin_unlock_irqrestore(&p->sighand->siglock, flags);
223                         todo++;
224                 } while_each_thread(g, p);
225                 read_unlock(&tasklist_lock);
226                 yield();                        /* Yield is okay here */
227                 if (time_after(jiffies, start_time + TIMEOUT)) {
228                         printk( "\n" );
229                         printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
230                         return todo;
231                 }
232         } while(todo);
233         
234         printk( "|\n" );
235         BUG_ON(in_atomic());
236         return 0;
237 }
238
239 void thaw_processes(void)
240 {
241         struct task_struct *g, *p;
242
243         printk( "Restarting tasks..." );
244         read_lock(&tasklist_lock);
245         do_each_thread(g, p) {
246                 INTERESTING(p);
247                 
248                 if (p->flags & PF_FROZEN) p->flags &= ~PF_FROZEN;
249                 else
250                         printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
251                 wake_up_process(p);
252         } while_each_thread(g, p);
253
254         read_unlock(&tasklist_lock);
255         printk( " done\n" );
256         MDELAY(500);
257 }
258
259 /*
260  * Saving part...
261  */
262
263 static __inline__ int fill_suspend_header(struct suspend_header *sh)
264 {
265         memset((char *)sh, 0, sizeof(*sh));
266
267         sh->version_code = LINUX_VERSION_CODE;
268         sh->num_physpages = num_physpages;
269         strncpy(sh->machine, system_utsname.machine, 8);
270         strncpy(sh->version, system_utsname.version, 20);
271         /* FIXME: Is this bogus? --RR */
272         sh->num_cpus = num_online_cpus();
273         sh->page_size = PAGE_SIZE;
274         sh->suspend_pagedir = pagedir_nosave;
275         BUG_ON (pagedir_save != pagedir_nosave);
276         sh->num_pbes = nr_copy_pages;
277         /* TODO: needed? mounted fs' last mounted date comparison
278          * [so they haven't been mounted since last suspend.
279          * Maybe it isn't.] [we'd need to do this for _all_ fs-es]
280          */
281         return 0;
282 }
283
/*
 * Our sync function: it only kicks the block queues. With this approach we
 * probably won't sleep, which should not be a problem because tasks are
 * stopped.
 */
static inline void do_suspend_sync(void)
{
	blk_run_queues();
#warning This might be broken. We need to somehow wait for data to reach the disk
}
294
/* We remember in swapfile_used[] which swap devices are used for suspension */
#define SWAPFILE_UNUSED    0
#define SWAPFILE_SUSPEND   1    /* This is the suspending device */
#define SWAPFILE_IGNORED   2    /* Those are other swap devices ignored for suspension */

/* Per swap type: one of the SWAPFILE_* values above (filled by read_swapfiles). */
static unsigned short swapfile_used[MAX_SWAPFILES];
/* Swap type holding the image; 0xFFFF means "none", which makes mark_swapfiles() a no-op. */
static unsigned short root_swap;
/* Modes for mark_swapfiles(): stamp the suspend signature, or put the swap signature back. */
#define MARK_SWAP_SUSPEND 0
#define MARK_SWAP_RESUME 2
304
305 static void mark_swapfiles(swp_entry_t prev, int mode)
306 {
307         swp_entry_t entry;
308         union diskpage *cur;
309         struct page *page;
310
311         if (root_swap == 0xFFFF)  /* ignored */
312                 return;
313
314         page = alloc_page(GFP_ATOMIC);
315         if (!page)
316                 panic("Out of memory in mark_swapfiles");
317         cur = page_address(page);
318         /* XXX: this is dirty hack to get first page of swap file */
319         entry = swp_entry(root_swap, 0);
320         rw_swap_page_sync(READ, entry, page);
321
322         if (mode == MARK_SWAP_RESUME) {
323                 if (!memcmp("S1",cur->swh.magic.magic,2))
324                         memcpy(cur->swh.magic.magic,"SWAP-SPACE",10);
325                 else if (!memcmp("S2",cur->swh.magic.magic,2))
326                         memcpy(cur->swh.magic.magic,"SWAPSPACE2",10);
327                 else printk("%sUnable to find suspended-data signature (%.10s - misspelled?\n", 
328                         name_resume, cur->swh.magic.magic);
329         } else {
330                 if ((!memcmp("SWAP-SPACE",cur->swh.magic.magic,10)))
331                         memcpy(cur->swh.magic.magic,"S1SUSP....",10);
332                 else if ((!memcmp("SWAPSPACE2",cur->swh.magic.magic,10)))
333                         memcpy(cur->swh.magic.magic,"S2SUSP....",10);
334                 else panic("\nSwapspace is not swapspace (%.10s)\n", cur->swh.magic.magic);
335                 cur->link.next = prev; /* prev is the first/last swap page of the resume area */
336                 /* link.next lies *no more* in last 4/8 bytes of magic */
337         }
338         rw_swap_page_sync(WRITE, entry, page);
339         __free_page(page);
340 }
341
342 static void read_swapfiles(void) /* This is called before saving image */
343 {
344         int i, len;
345         
346         len=strlen(resume_file);
347         root_swap = 0xFFFF;
348         
349         swap_list_lock();
350         for(i=0; i<MAX_SWAPFILES; i++) {
351                 if (swap_info[i].flags == 0) {
352                         swapfile_used[i]=SWAPFILE_UNUSED;
353                 } else {
354                         if(!len) {
355                                 printk(KERN_WARNING "resume= option should be used to set suspend device" );
356                                 if(root_swap == 0xFFFF) {
357                                         swapfile_used[i] = SWAPFILE_SUSPEND;
358                                         root_swap = i;
359                                 } else
360                                         swapfile_used[i] = SWAPFILE_IGNORED;                              
361                         } else {
362                                 /* we ignore all swap devices that are not the resume_file */
363                                 if (1) {
364 // FIXME                                if(resume_device == swap_info[i].swap_device) {
365                                         swapfile_used[i] = SWAPFILE_SUSPEND;
366                                         root_swap = i;
367                                 } else {
368 #if 0
369                                         printk( "Resume: device %s (%x != %x) ignored\n", swap_info[i].swap_file->d_name.name, swap_info[i].swap_device, resume_device );                                 
370 #endif
371                                         swapfile_used[i] = SWAPFILE_IGNORED;
372                                 }
373                         }
374                 }
375         }
376         swap_list_unlock();
377 }
378
379 static void lock_swapdevices(void) /* This is called after saving image so modification
380                                       will be lost after resume... and that's what we want. */
381 {
382         int i;
383
384         swap_list_lock();
385         for(i = 0; i< MAX_SWAPFILES; i++)
386                 if(swapfile_used[i] == SWAPFILE_IGNORED) {
387                         swap_info[i].flags ^= 0xFF; /* we make the device unusable. A new call to
388                                                        lock_swapdevices can unlock the devices. */
389                 }
390         swap_list_unlock();
391 }
392
393 static int write_suspend_image(void)
394 {
395         int i;
396         swp_entry_t entry, prev = { 0 };
397         int nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
398         union diskpage *cur,  *buffer = (union diskpage *)get_zeroed_page(GFP_ATOMIC);
399         unsigned long address;
400         struct page *page;
401
402         printk( "Writing data to swap (%d pages): ", nr_copy_pages );
403         for (i=0; i<nr_copy_pages; i++) {
404                 if (!(i%100))
405                         printk( "." );
406                 if (!(entry = get_swap_page()).val)
407                         panic("\nNot enough swapspace when writing data" );
408                 
409                 if (swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
410                         panic("\nPage %d: not enough swapspace on suspend device", i );
411             
412                 address = (pagedir_nosave+i)->address;
413                 page = virt_to_page(address);
414                 rw_swap_page_sync(WRITE, entry, page);
415                 (pagedir_nosave+i)->swap_address = entry;
416         }
417         printk( "|\n" );
418         printk( "Writing pagedir (%d pages): ", nr_pgdir_pages);
419         for (i=0; i<nr_pgdir_pages; i++) {
420                 cur = (union diskpage *)((char *) pagedir_nosave)+i;
421                 BUG_ON ((char *) cur != (((char *) pagedir_nosave) + i*PAGE_SIZE));
422                 printk( "." );
423                 if (!(entry = get_swap_page()).val) {
424                         printk(KERN_CRIT "Not enough swapspace when writing pgdir\n" );
425                         panic("Don't know how to recover");
426                         free_page((unsigned long) buffer);
427                         return -ENOSPC;
428                 }
429
430                 if(swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
431                         panic("\nNot enough swapspace for pagedir on suspend device" );
432
433                 BUG_ON (sizeof(swp_entry_t) != sizeof(long));
434                 BUG_ON (PAGE_SIZE % sizeof(struct pbe));
435
436                 cur->link.next = prev;                          
437                 page = virt_to_page((unsigned long)cur);
438                 rw_swap_page_sync(WRITE, entry, page);
439                 prev = entry;
440         }
441         printk("H");
442         BUG_ON (sizeof(struct suspend_header) > PAGE_SIZE-sizeof(swp_entry_t));
443         BUG_ON (sizeof(union diskpage) != PAGE_SIZE);
444         if (!(entry = get_swap_page()).val)
445                 panic( "\nNot enough swapspace when writing header" );
446         if (swapfile_used[swp_type(entry)] != SWAPFILE_SUSPEND)
447                 panic("\nNot enough swapspace for header on suspend device" );
448
449         cur = (void *) buffer;
450         if (fill_suspend_header(&cur->sh))
451                 panic("\nOut of memory while writing header");
452                 
453         cur->link.next = prev;
454
455         page = virt_to_page((unsigned long)cur);
456         rw_swap_page_sync(WRITE, entry, page);
457         prev = entry;
458
459         printk( "S" );
460         mark_swapfiles(prev, MARK_SWAP_SUSPEND);
461         printk( "|\n" );
462
463         MDELAY(1000);
464         free_page((unsigned long) buffer);
465         return 0;
466 }
467
468 /* if pagedir_p != NULL it also copies the counted pages */
469 static int count_and_copy_data_pages(struct pbe *pagedir_p)
470 {
471         int chunk_size;
472         int nr_copy_pages = 0;
473         int pfn;
474         struct page *page;
475         
476 #ifdef CONFIG_DISCONTIGMEM
477         panic("Discontingmem not supported");
478 #else
479         BUG_ON (max_pfn != num_physpages);
480 #endif
481         for (pfn = 0; pfn < max_pfn; pfn++) {
482                 page = pfn_to_page(pfn);
483                 if (PageHighMem(page))
484                         panic("Swsusp not supported on highmem boxes. Send 1GB of RAM to <pavel@ucw.cz> and try again ;-).");
485
486                 if (!PageReserved(page)) {
487                         if (PageNosave(page))
488                                 continue;
489
490                         if ((chunk_size=is_head_of_free_region(page))!=0) {
491                                 pfn += chunk_size - 1;
492                                 continue;
493                         }
494                 } else if (PageReserved(page)) {
495                         BUG_ON (PageNosave(page));
496
497                         /*
498                          * Just copy whole code segment. Hopefully it is not that big.
499                          */
500                         if ((ADDRESS(pfn) >= (unsigned long) ADDRESS2(&__nosave_begin)) && 
501                             (ADDRESS(pfn) <  (unsigned long) ADDRESS2(&__nosave_end))) {
502                                 PRINTK("[nosave %lx]", ADDRESS(pfn));
503                                 continue;
504                         }
505                         /* Hmm, perhaps copying all reserved pages is not too healthy as they may contain 
506                            critical bios data? */
507                 } else  BUG();
508
509                 nr_copy_pages++;
510                 if (pagedir_p) {
511                         pagedir_p->orig_address = ADDRESS(pfn);
512                         copy_page((void *) pagedir_p->address, (void *) pagedir_p->orig_address);
513                         pagedir_p++;
514                 }
515         }
516         return nr_copy_pages;
517 }
518
519 static void free_suspend_pagedir(unsigned long this_pagedir)
520 {
521         struct page *page;
522         int pfn;
523         unsigned long this_pagedir_end = this_pagedir +
524                 (PAGE_SIZE << pagedir_order);
525
526         for(pfn = 0; pfn < num_physpages; pfn++) {
527                 page = pfn_to_page(pfn);
528                 if (!TestClearPageNosave(page))
529                         continue;
530
531                 if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end)
532                         continue; /* old pagedir gets freed in one */
533                 
534                 free_page(ADDRESS(pfn));
535         }
536         free_pages(this_pagedir, pagedir_order);
537 }
538
539 static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
540 {
541         int i;
542         suspend_pagedir_t *pagedir;
543         struct pbe *p;
544         struct page *page;
545
546         pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));
547
548         p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order);
549         if(!pagedir)
550                 return NULL;
551
552         page = virt_to_page(pagedir);
553         for(i=0; i < 1<<pagedir_order; i++)
554                 SetPageNosave(page++);
555                 
556         while(nr_copy_pages--) {
557                 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
558                 if(!p->address) {
559                         free_suspend_pagedir((unsigned long) pagedir);
560                         return NULL;
561                 }
562                 printk(".");
563                 SetPageNosave(virt_to_page(p->address));
564                 p->orig_address = 0;
565                 p++;
566         }
567         return pagedir;
568 }
569
570 static int prepare_suspend_console(void)
571 {
572         orig_loglevel = console_loglevel;
573         console_loglevel = new_loglevel;
574
575 #ifdef CONFIG_VT
576         orig_fgconsole = fg_console;
577 #ifdef SUSPEND_CONSOLE
578         if(vc_allocate(SUSPEND_CONSOLE))
579           /* we can't have a free VC for now. Too bad,
580            * we don't want to mess the screen for now. */
581                 return 1;
582
583         set_console (SUSPEND_CONSOLE);
584         if(vt_waitactive(SUSPEND_CONSOLE)) {
585                 PRINTK("Bummer. Can't switch VCs.");
586                 return 1;
587         }
588         orig_kmsg = kmsg_redirect;
589         kmsg_redirect = SUSPEND_CONSOLE;
590 #endif
591 #endif
592         return 0;
593 }
594
595 static void restore_console(void)
596 {
597         console_loglevel = orig_loglevel;
598 #ifdef SUSPEND_CONSOLE
599         set_console (orig_fgconsole);
600 #endif
601         return;
602 }
603
604 static int prepare_suspend_processes(void)
605 {
606         sys_sync();     /* Syncing needs pdflushd, so do it before stopping processes */
607         if (freeze_processes()) {
608                 printk( KERN_ERR "Suspend failed: Not all processes stopped!\n" );
609                 thaw_processes();
610                 return 1;
611         }
612         return 0;
613 }
614
/*
 * Try to free as much memory as possible, but do not OOM-kill anyone.
 *
 * shrink_all_memory() is called over and over until it returns 0; one dot
 * is printed per successful round.
 *
 * Notice: all userland should be stopped at this point, or livelock is
 * possible.
 */
static void free_some_memory(void)
{
	printk("Freeing memory: ");
	for (;;) {
		if (!shrink_all_memory(10000))
			break;
		printk(".");
	}
	printk("|\n");
}
627
628 /* Make disk drivers accept operations, again */
629 static void drivers_unsuspend(void)
630 {
631         device_resume(RESUME_RESTORE_STATE);
632         device_resume(RESUME_ENABLE);
633 }
634
635 /* Called from process context */
636 static int drivers_suspend(void)
637 {
638         device_suspend(4, SUSPEND_NOTIFY);
639         device_suspend(4, SUSPEND_SAVE_STATE);
640         device_suspend(4, SUSPEND_DISABLE);
641         if(!pm_suspend_state) {
642                 if(pm_send_all(PM_SUSPEND,(void *)3)) {
643                         printk(KERN_WARNING "Problem while sending suspend event\n");
644                         return(1);
645                 }
646                 pm_suspend_state=1;
647         } else
648                 printk(KERN_WARNING "PM suspend state already raised\n");
649           
650         return(0);
651 }
652
653 #define RESUME_PHASE1 1 /* Called from interrupts disabled */
654 #define RESUME_PHASE2 2 /* Called with interrupts enabled */
655 #define RESUME_ALL_PHASES (RESUME_PHASE1 | RESUME_PHASE2)
656 static void drivers_resume(int flags)
657 {
658         if (flags & RESUME_PHASE1) {
659                 device_resume(RESUME_RESTORE_STATE);
660                 device_resume(RESUME_ENABLE);
661         }
662         if (flags & RESUME_PHASE2) {
663                 if(pm_suspend_state) {
664                         if(pm_send_all(PM_RESUME,(void *)0))
665                                 printk(KERN_WARNING "Problem while sending resume event\n");
666                         pm_suspend_state=0;
667                 } else
668                         printk(KERN_WARNING "PM suspend state wasn't raised\n");
669
670 #ifdef SUSPEND_CONSOLE
671                 update_screen(fg_console);      /* Hmm, is this the problem? */
672 #endif
673         }
674 }
675
676 static int suspend_prepare_image(void)
677 {
678         struct sysinfo i;
679         unsigned int nr_needed_pages = 0;
680
681         drain_local_pages();
682
683         pagedir_nosave = NULL;
684         printk( "/critical section: Counting pages to copy" );
685         nr_copy_pages = count_and_copy_data_pages(NULL);
686         nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;
687         
688         printk(" (pages needed: %d+%d=%d free: %d)\n",nr_copy_pages,PAGES_FOR_IO,nr_needed_pages,nr_free_pages());
689         if(nr_free_pages() < nr_needed_pages) {
690                 printk(KERN_CRIT "%sCouldn't get enough free pages, on %d pages short\n",
691                        name_suspend, nr_needed_pages-nr_free_pages());
692                 root_swap = 0xFFFF;
693                 return 1;
694         }
695         si_swapinfo(&i);        /* FIXME: si_swapinfo(&i) returns all swap devices information.
696                                    We should only consider resume_device. */
697         if (i.freeswap < nr_needed_pages)  {
698                 printk(KERN_CRIT "%sThere's not enough swap space available, on %ld pages short\n",
699                        name_suspend, nr_needed_pages-i.freeswap);
700                 return 1;
701         }
702
703         PRINTK( "Alloc pagedir\n" ); 
704         pagedir_save = pagedir_nosave = create_suspend_pagedir(nr_copy_pages);
705         if(!pagedir_nosave) {
706                 /* Shouldn't happen */
707                 printk(KERN_CRIT "%sCouldn't allocate enough pages\n",name_suspend);
708                 panic("Really should not happen");
709                 return 1;
710         }
711         nr_copy_pages_check = nr_copy_pages;
712         pagedir_order_check = pagedir_order;
713
714         drain_local_pages();    /* During allocating of suspend pagedir, new cold pages may appear. Kill them */
715         if (nr_copy_pages != count_and_copy_data_pages(pagedir_nosave)) /* copy */
716                 BUG();
717
718         /*
719          * End of critical section. From now on, we can write to memory,
720          * but we should not touch disk. This specially means we must _not_
721          * touch swap space! Except we must write out our image of course.
722          */
723
724         printk( "critical section/: done (%d pages copied)\n", nr_copy_pages );
725         return 0;
726 }
727
/*
 * Write the prepared image to swap.
 *
 * Drivers are brought back first so that the disk can be used; the ignored
 * swap devices are made unusable around the write and usable again after.
 */
static void suspend_save_image(void)
{
	drivers_unsuspend();

	lock_swapdevices();	/* hide the ignored swap devices */
	write_suspend_image();
	lock_swapdevices();	/* This will unlock ignored swap devices since writing is finished */

	/* It is important _NOT_ to umount filesystems at this point. We want
	 * them synced (in case something goes wrong) but we DO not want to mark
	 * filesystem clean: it is not. (And it does not matter, if we resume
	 * correctly, we'll mark system clean, anyway.)
	 */
}
742
743 static void suspend_power_down(void)
744 {
745         extern int C_A_D;
746         C_A_D = 0;
747         printk(KERN_EMERG "%s%s Trying to power down.\n", name_suspend, TEST_SWSUSP ? "Disable TEST_SWSUSP. NOT ": "");
748 #ifdef CONFIG_VT
749         PRINTK(KERN_EMERG "shift_state: %04x\n", shift_state);
750         mdelay(1000);
751         if (TEST_SWSUSP ^ (!!(shift_state & (1 << KG_CTRL))))
752                 machine_restart(NULL);
753         else
754 #endif
755         {
756                 device_shutdown();
757                 machine_power_off();
758         }
759
760         printk(KERN_EMERG "%sProbably not capable for powerdown. System halted.\n", name_suspend);
761         machine_halt();
762         while (1);
763         /* NOTREACHED */
764 }
765
766 /*
767  * Magic happens here
768  */
769
770 void do_magic_resume_1(void)
771 {
772         barrier();
773         mb();
774         spin_lock_irq(&suspend_pagedir_lock);   /* Done to disable interrupts */ 
775
776         PRINTK( "Waiting for DMAs to settle down...\n");
777         mdelay(1000);   /* We do not want some readahead with DMA to corrupt our memory, right?
778                            Do it with disabled interrupts for best effect. That way, if some
779                            driver scheduled DMA, we have good chance for DMA to finish ;-). */
780 }
781
782 void do_magic_resume_2(void)
783 {
784         BUG_ON (nr_copy_pages_check != nr_copy_pages);
785         BUG_ON (pagedir_order_check != pagedir_order);
786
787         __flush_tlb_global();           /* Even mappings of "global" things (vmalloc) need to be fixed */
788
789         PRINTK( "Freeing prev allocated pagedir\n" );
790         free_suspend_pagedir((unsigned long) pagedir_save);
791         spin_unlock_irq(&suspend_pagedir_lock);
792         drivers_resume(RESUME_ALL_PHASES);
793
794         PRINTK( "Fixing swap signatures... " );
795         mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
796         PRINTK( "ok\n" );
797
798 #ifdef SUSPEND_CONSOLE
799         update_screen(fg_console);      /* Hmm, is this the problem? */
800 #endif
801 }
802
803 /* do_magic() is implemented in arch/?/kernel/suspend_asm.S, and basically does:
804
805         if (!resume) {
806                 do_magic_suspend_1();
807                 save_processor_state();
808                 SAVE_REGISTERS
809                 do_magic_suspend_2();
810                 return;
811         }
812         GO_TO_SWAPPER_PAGE_TABLES
813         do_magic_resume_1();
814         COPY_PAGES_BACK
815         RESTORE_REGISTERS
816         restore_processor_state();
817         do_magic_resume_2();
818
819  */
820
821 void do_magic_suspend_1(void)
822 {
823         mb();
824         barrier();
825         BUG_ON(in_atomic());
826         spin_lock_irq(&suspend_pagedir_lock);
827 }
828
829 void do_magic_suspend_2(void)
830 {
831         int is_problem;
832         read_swapfiles();
833         is_problem = suspend_prepare_image();
834         spin_unlock_irq(&suspend_pagedir_lock);
835         if (!is_problem) {
836                 kernel_fpu_end();       /* save_processor_state() does kernel_fpu_begin, and we need to revert it in order to pass in_atomic() checks */
837                 BUG_ON(in_atomic());
838                 suspend_save_image();
839                 suspend_power_down();   /* FIXME: if suspend_power_down is commented out, console is lost after few suspends ?! */
840         }
841
842         printk(KERN_EMERG "%sSuspend failed, trying to recover...\n", name_suspend);
843         MDELAY(1000); /* So user can wait and report us messages if armageddon comes :-) */
844
845         barrier();
846         mb();
847         spin_lock_irq(&suspend_pagedir_lock);   /* Done to disable interrupts */ 
848         mdelay(1000);
849
850         free_pages((unsigned long) pagedir_nosave, pagedir_order);
851         spin_unlock_irq(&suspend_pagedir_lock);
852         mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
853         PRINTK(KERN_WARNING "%sLeaving do_magic_suspend_2...\n", name_suspend); 
854 }
855
856 static void do_software_suspend(void)
857 {
858         arch_prepare_suspend();
859         if (prepare_suspend_console())
860                 printk( "%sCan't allocate a console... proceeding\n", name_suspend);
861         if (!prepare_suspend_processes()) {
862
863                 /* At this point, all user processes and "dangerous"
864                    kernel threads are stopped. Free some memory, as we
865                    need half of memory free. */
866
867                 free_some_memory();
868                 
869                 /* No need to invalidate any vfsmnt list -- they will be valid after resume, anyway.
870                  *
871                  * We sync here -- so you have consistent filesystem state when things go wrong.
872                  * -- so that noone writes to disk after we do atomic copy of data.
873                  */
874                 PRINTK("Syncing disks before copy\n");
875                 do_suspend_sync();
876
877                 /* Save state of all device drivers, and stop them. */             
878                 if(drivers_suspend()==0)
879                         /* If stopping device drivers worked, we proceed basically into
880                          * suspend_save_image.
881                          *
882                          * do_magic(0) returns after system is resumed.
883                          *
884                          * do_magic() copies all "used" memory to "free" memory, then
885                          * unsuspends all device drivers, and writes memory to disk
886                          * using normal kernel mechanism.
887                          */
888                         do_magic(0);
889                 PRINTK("Restarting processes...\n");
890                 thaw_processes();
891         }
892         software_suspend_enabled = 1;
893         MDELAY(1000);
894         restore_console ();
895 }
896
897 /*
898  * This is main interface to the outside world. It needs to be
899  * called from process context.
900  */
901 void software_suspend(void)
902 {
903         if(!software_suspend_enabled)
904                 return;
905
906         software_suspend_enabled = 0;
907         BUG_ON(in_interrupt());
908         do_software_suspend();
909 }
910
911 /* More restore stuff */
912
913 /* FIXME: Why not memcpy(to, from, 1<<pagedir_order*PAGE_SIZE)? */
914 static void copy_pagedir(suspend_pagedir_t *to, suspend_pagedir_t *from)
915 {
916         int i;
917         char *topointer=(char *)to, *frompointer=(char *)from;
918
919         for(i=0; i < 1 << pagedir_order; i++) {
920                 copy_page(topointer, frompointer);
921                 topointer += PAGE_SIZE;
922                 frompointer += PAGE_SIZE;
923         }
924 }
925
/* Single-page (order 0) collision test against the current pagedir_nosave. */
#define does_collide(addr) does_collide_order(pagedir_nosave, (addr), 0)
927
928 /*
929  * Returns true if given address/order collides with any orig_address 
930  */
931 static int does_collide_order(suspend_pagedir_t *pagedir, unsigned long addr,
932                 int order)
933 {
934         int i;
935         unsigned long addre = addr + (PAGE_SIZE<<order);
936         
937         for(i=0; i < nr_copy_pages; i++)
938                 if((pagedir+i)->orig_address >= addr &&
939                         (pagedir+i)->orig_address < addre)
940                         return 1;
941
942         return 0;
943 }
944
945 /*
946  * We check here that pagedir & pages it points to won't collide with pages
947  * where we're going to restore from the loaded pages later
948  */
949 static int check_pagedir(void)
950 {
951         int i;
952
953         for(i=0; i < nr_copy_pages; i++) {
954                 unsigned long addr;
955
956                 do {
957                         addr = get_zeroed_page(GFP_ATOMIC);
958                         if(!addr)
959                                 return -ENOMEM;
960                 } while (does_collide(addr));
961
962                 (pagedir_nosave+i)->address = addr;
963         }
964         return 0;
965 }
966
967 static int relocate_pagedir(void)
968 {
969         /*
970          * We have to avoid recursion (not to overflow kernel stack),
971          * and that's why code looks pretty cryptic 
972          */
973         suspend_pagedir_t *new_pagedir, *old_pagedir = pagedir_nosave;
974         void **eaten_memory = NULL;
975         void **c = eaten_memory, *m, *f;
976
977         printk("Relocating pagedir");
978
979         if(!does_collide_order(old_pagedir, (unsigned long)old_pagedir, pagedir_order)) {
980                 printk("not necessary\n");
981                 return 0;
982         }
983
984         while ((m = (void *) __get_free_pages(GFP_ATOMIC, pagedir_order))) {
985                 memset(m, 0, PAGE_SIZE);
986                 if (!does_collide_order(old_pagedir, (unsigned long)m, pagedir_order))
987                         break;
988                 eaten_memory = m;
989                 printk( "." ); 
990                 *eaten_memory = c;
991                 c = eaten_memory;
992         }
993
994         if (!m)
995                 return -ENOMEM;
996
997         pagedir_nosave = new_pagedir = m;
998         copy_pagedir(new_pagedir, old_pagedir);
999
1000         c = eaten_memory;
1001         while(c) {
1002                 printk(":");
1003                 f = *c;
1004                 c = *c;
1005                 if (f)
1006                         free_pages((unsigned long)f, pagedir_order);
1007         }
1008         printk("|\n");
1009         return 0;
1010 }
1011
1012 /*
1013  * Sanity check if this image makes sense with this kernel/swap context
1014  * I really don't think that it's foolproof but more than nothing..
1015  */
1016
1017 static int sanity_check_failed(char *reason)
1018 {
1019         printk(KERN_ERR "%s%s\n",name_resume,reason);
1020         return -EPERM;
1021 }
1022
1023 static int sanity_check(struct suspend_header *sh)
1024 {
1025         if(sh->version_code != LINUX_VERSION_CODE)
1026                 return sanity_check_failed("Incorrect kernel version");
1027         if(sh->num_physpages != num_physpages)
1028                 return sanity_check_failed("Incorrect memory size");
1029         if(strncmp(sh->machine, system_utsname.machine, 8))
1030                 return sanity_check_failed("Incorrect machine type");
1031         if(strncmp(sh->version, system_utsname.version, 20))
1032                 return sanity_check_failed("Incorrect version");
1033         if(sh->num_cpus != num_online_cpus())
1034                 return sanity_check_failed("Incorrect number of cpus");
1035         if(sh->page_size != PAGE_SIZE)
1036                 return sanity_check_failed("Incorrect PAGE_SIZE");
1037         return 0;
1038 }
1039
1040 static int bdev_read_page(struct block_device *bdev, long pos, void *buf)
1041 {
1042         struct buffer_head *bh;
1043         BUG_ON (pos%PAGE_SIZE);
1044         bh = __bread(bdev, pos/PAGE_SIZE, PAGE_SIZE);
1045         if (!bh || (!bh->b_data)) {
1046                 return -1;
1047         }
1048         memcpy(buf, bh->b_data, PAGE_SIZE);     /* FIXME: may need kmap() */
1049         BUG_ON(!buffer_uptodate(bh));
1050         brelse(bh);
1051         return 0;
1052
1053
1054 static int bdev_write_page(struct block_device *bdev, long pos, void *buf)
1055 {
1056 #if 0
1057         struct buffer_head *bh;
1058         BUG_ON (pos%PAGE_SIZE);
1059         bh = __bread(bdev, pos/PAGE_SIZE, PAGE_SIZE);
1060         if (!bh || (!bh->b_data)) {
1061                 return -1;
1062         }
1063         memcpy(bh->b_data, buf, PAGE_SIZE);     /* FIXME: may need kmap() */
1064         BUG_ON(!buffer_uptodate(bh));
1065         generic_make_request(WRITE, bh);
1066         if (!buffer_uptodate(bh))
1067                 printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unsuccessful...\n", name_resume, resume_file);
1068         wait_on_buffer(bh);
1069         brelse(bh);
1070         return 0;
1071 #endif
1072         printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unimplemented...\n", name_resume, resume_file);
1073         return 0;
1074 }
1075
1076 extern dev_t __init name_to_dev_t(const char *line);
1077
1078 static int __read_suspend_image(struct block_device *bdev, union diskpage *cur, int noresume)
1079 {
1080         swp_entry_t next;
1081         int i, nr_pgdir_pages;
1082
1083 #define PREPARENEXT \
1084         {       next = cur->link.next; \
1085                 next.val = swp_offset(next) * PAGE_SIZE; \
1086         }
1087
1088         if (bdev_read_page(bdev, 0, cur)) return -EIO;
1089
1090         if ((!memcmp("SWAP-SPACE",cur->swh.magic.magic,10)) ||
1091             (!memcmp("SWAPSPACE2",cur->swh.magic.magic,10))) {
1092                 printk(KERN_ERR "%sThis is normal swap space\n", name_resume );
1093                 return -EINVAL;
1094         }
1095
1096         PREPARENEXT; /* We have to read next position before we overwrite it */
1097
1098         if (!memcmp("S1",cur->swh.magic.magic,2))
1099                 memcpy(cur->swh.magic.magic,"SWAP-SPACE",10);
1100         else if (!memcmp("S2",cur->swh.magic.magic,2))
1101                 memcpy(cur->swh.magic.magic,"SWAPSPACE2",10);
1102         else {
1103                 if (noresume)
1104                         return -EINVAL;
1105                 panic("%sUnable to find suspended-data signature (%.10s - misspelled?\n", 
1106                         name_resume, cur->swh.magic.magic);
1107         }
1108         if (noresume) {
1109                 /* We don't do a sanity check here: we want to restore the swap
1110                    whatever version of kernel made the suspend image;
1111                    We need to write swap, but swap is *not* enabled so
1112                    we must write the device directly */
1113                 printk("%s: Fixing swap signatures %s...\n", name_resume, resume_file);
1114                 bdev_write_page(bdev, 0, cur);
1115         }
1116
1117         if (prepare_suspend_console())
1118                 printk("%sCan't allocate a console... proceeding\n", name_resume);
1119         printk( "%sSignature found, resuming\n", name_resume );
1120         MDELAY(1000);
1121
1122         if (bdev_read_page(bdev, next.val, cur)) return -EIO;
1123         if (sanity_check(&cur->sh))     /* Is this same machine? */     
1124                 return -EPERM;
1125         PREPARENEXT;
1126
1127         pagedir_save = cur->sh.suspend_pagedir;
1128         nr_copy_pages = cur->sh.num_pbes;
1129         nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
1130         pagedir_order = get_bitmask_order(nr_pgdir_pages);
1131
1132         pagedir_nosave = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC, pagedir_order);
1133         if (!pagedir_nosave)
1134                 return -ENOMEM;
1135
1136         PRINTK( "%sReading pagedir, ", name_resume );
1137
1138         /* We get pages in reverse order of saving! */
1139         for (i=nr_pgdir_pages-1; i>=0; i--) {
1140                 BUG_ON (!next.val);
1141                 cur = (union diskpage *)((char *) pagedir_nosave)+i;
1142                 if (bdev_read_page(bdev, next.val, cur)) return -EIO;
1143                 PREPARENEXT;
1144         }
1145         BUG_ON (next.val);
1146
1147         if (relocate_pagedir())
1148                 return -ENOMEM;
1149         if (check_pagedir())
1150                 return -ENOMEM;
1151
1152         printk( "Reading image data (%d pages): ", nr_copy_pages );
1153         for(i=0; i < nr_copy_pages; i++) {
1154                 swp_entry_t swap_address = (pagedir_nosave+i)->swap_address;
1155                 if (!(i%100))
1156                         printk( "." );
1157                 /* You do not need to check for overlaps...
1158                    ... check_pagedir already did this work */
1159                 if (bdev_read_page(bdev, swp_offset(swap_address) * PAGE_SIZE, (char *)((pagedir_nosave+i)->address)))
1160                         return -EIO;
1161         }
1162         printk( "|\n" );
1163         return 0;
1164 }
1165
1166 static int read_suspend_image(const char * specialfile, int noresume)
1167 {
1168         union diskpage *cur;
1169         unsigned long scratch_page = 0;
1170         int error;
1171         char b[BDEVNAME_SIZE];
1172
1173         resume_device = name_to_dev_t(specialfile);
1174         scratch_page = get_zeroed_page(GFP_ATOMIC);
1175         cur = (void *) scratch_page;
1176         if (cur) {
1177                 struct block_device *bdev;
1178                 printk("Resuming from device %s\n",
1179                                 __bdevname(resume_device, b));
1180                 bdev = open_by_devnum(resume_device, FMODE_READ, BDEV_RAW);
1181                 if (IS_ERR(bdev)) {
1182                         error = PTR_ERR(bdev);
1183                 } else {
1184                         set_blocksize(bdev, PAGE_SIZE);
1185                         error = __read_suspend_image(bdev, cur, noresume);
1186                         blkdev_put(bdev, BDEV_RAW);
1187                 }
1188         } else error = -ENOMEM;
1189
1190         if (scratch_page)
1191                 free_page(scratch_page);
1192         switch (error) {
1193                 case 0:
1194                         PRINTK("Reading resume file was successful\n");
1195                         break;
1196                 case -EINVAL:
1197                         break;
1198                 case -EIO:
1199                         printk( "%sI/O error\n", name_resume);
1200                         break;
1201                 case -ENOENT:
1202                         printk( "%s%s: No such file or directory\n", name_resume, specialfile);
1203                         break;
1204                 case -ENOMEM:
1205                         printk( "%sNot enough memory\n", name_resume);
1206                         break;
1207                 default:
1208                         printk( "%sError %d resuming\n", name_resume, error );
1209         }
1210         MDELAY(1000);
1211         return error;
1212 }
1213
1214 /*
1215  * Called from init kernel_thread.
1216  * We check if we have an image and if so we try to resume
1217  */
1218
1219 void software_resume(void)
1220 {
1221         if (num_online_cpus() > 1) {
1222                 printk(KERN_WARNING "Software Suspend has malfunctioning SMP support. Disabled :(\n");  
1223                 return;
1224         }
1225         /* We enable the possibility of machine suspend */
1226         software_suspend_enabled = 1;
1227         if (!resume_status)
1228                 return;
1229
1230         printk( "%s", name_resume );
1231         if (resume_status == NORESUME) {
1232                 if(resume_file[0])
1233                         read_suspend_image(resume_file, 1);
1234                 printk( "disabled\n" );
1235                 return;
1236         }
1237         MDELAY(1000);
1238
1239         orig_loglevel = console_loglevel;
1240         console_loglevel = new_loglevel;
1241
1242         if (!resume_file[0] && resume_status == RESUME_SPECIFIED) {
1243                 printk( "suspension device unspecified\n" );
1244                 return;
1245         }
1246
1247         printk( "resuming from %s\n", resume_file);
1248         if (read_suspend_image(resume_file, 0))
1249                 goto read_failure;
1250         do_magic(1);
1251         panic("This never returns");
1252
1253 read_failure:
1254         console_loglevel = orig_loglevel;
1255         return;
1256 }
1257
1258 static int __init resume_setup(char *str)
1259 {
1260         if (resume_status == NORESUME)
1261                 return 1;
1262
1263         strncpy( resume_file, str, 255 );
1264         resume_status = RESUME_SPECIFIED;
1265
1266         return 1;
1267 }
1268
1269 static int __init noresume_setup(char *str)
1270 {
1271         resume_status = NORESUME;
1272         return 1;
1273 }
1274
1275 __setup("noresume", noresume_setup);
1276 __setup("resume=", resume_setup);
1277
1278 EXPORT_SYMBOL(software_suspend);
1279 EXPORT_SYMBOL(software_suspend_enabled);
1280 EXPORT_SYMBOL(refrigerator);