/*
 *  linux/kernel/exit.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/config.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#ifdef CONFIG_BSD_PROCESS_ACCT
#include <linux/acct.h>
#endif

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>

extern void sem_exit (void);
extern struct task_struct *child_reaper;

int getrusage(struct task_struct *, int, struct rusage *);

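/*
 * Final teardown of a dead child: wait until it is off any runqueue,
 * drop its per-user accounting, unhash it, fold its fault/swap
 * counters and remaining timeslice into the parent, and free the
 * task structure.
 */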
static void release_task(struct task_struct * p)
{
        if (p != current) {
#ifdef CONFIG_SMP
                /*
                 * Wait to make sure the process isn't on the
                 * runqueue (active on some other CPU still)
                 */
                for (;;) {
                        task_lock(p);
                        if (!task_has_cpu(p))
                                break;
                        task_unlock(p);
                        do {
                                cpu_relax();
                                barrier();
                        } while (task_has_cpu(p));
                }
                task_unlock(p);
#endif
                atomic_dec(&p->user->processes);
                free_uid(p->user);
                unhash_process(p);

                release_thread(p);
                current->cmin_flt += p->min_flt + p->cmin_flt;
                current->cmaj_flt += p->maj_flt + p->cmaj_flt;
                current->cnswap += p->nswap + p->cnswap;
                /*
                 * Potentially available timeslices are retrieved
                 * here - this way the parent does not get penalized
                 * for creating too many processes.
                 *
                 * (this cannot be used to artificially 'generate'
                 * timeslices, because any timeslice recovered here
                 * was given away by the parent in the first place.)
                 */
                current->counter += p->counter;
                if (current->counter >= MAX_COUNTER)
                        current->counter = MAX_COUNTER;
                p->pid = 0;
                free_task_struct(p);
        } else {
                printk("task releasing itself\n");
        }
}

/*
 * This checks not only the pgrp, but falls back on the pid if no
 * satisfactory pgrp is found. I dunno - gdb doesn't work correctly
 * without this...
 */
int session_of_pgrp(int pgrp)
{
        struct task_struct *p;
        int fallback;

        fallback = -1;
        read_lock(&tasklist_lock);
        for_each_task(p) {
                if (p->session <= 0)
                        continue;
                if (p->pgrp == pgrp) {
                        fallback = p->session;
                        break;
                }
                if (p->pid == pgrp)
                        fallback = p->session;
        }
        read_unlock(&tasklist_lock);
        return fallback;
}

/*
 * Determine if a process group is "orphaned", according to the POSIX
 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 * by terminal-generated stop signals.  Newly orphaned process groups are
 * to receive a SIGHUP and a SIGCONT.
 *
 * "I ask you, have you ever known what it is to be an orphan?"
 */
static int will_become_orphaned_pgrp(int pgrp, struct task_struct * ignored_task)
{
        struct task_struct *p;

        read_lock(&tasklist_lock);
        for_each_task(p) {
                if ((p == ignored_task) || (p->pgrp != pgrp) ||
                    (p->state == TASK_ZOMBIE) ||
                    (p->p_pptr->pid == 1))
                        continue;
                if ((p->p_pptr->pgrp != pgrp) &&
                    (p->p_pptr->session == p->session)) {
                        read_unlock(&tasklist_lock);
                        return 0;
                }
        }
        read_unlock(&tasklist_lock);
        return 1;       /* (sighing) "Often!" */
}

int is_orphaned_pgrp(int pgrp)
{
        return will_become_orphaned_pgrp(pgrp, NULL);
}

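/*
 * Returns nonzero if any member of the given process group is
 * currently stopped.  Used together with the orphaned-pgrp checks
 * above to decide whether a newly orphaned group must be sent
 * SIGHUP and SIGCONT.
 */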
static inline int has_stopped_jobs(int pgrp)
{
        int retval = 0;
        struct task_struct * p;

        read_lock(&tasklist_lock);
        for_each_task(p) {
                if (p->pgrp != pgrp)
                        continue;
                if (p->state != TASK_STOPPED)
                        continue;
                retval = 1;
                break;
        }
        read_unlock(&tasklist_lock);
        return retval;
}

/*
 * When we die, we re-parent all our children.
 * Try to give them to another thread in our process
 * group, and if no such member exists, give them to
 * the global child reaper process (ie "init")
 */
static inline void forget_original_parent(struct task_struct * father)
{
        struct task_struct * p, *reaper;

        read_lock(&tasklist_lock);

        /* Next in our thread group */
        reaper = next_thread(father);
        if (reaper == father)
                reaper = child_reaper;

        for_each_task(p) {
                if (p->p_opptr == father) {
                        /* We don't want people slaying init */
                        p->exit_signal = SIGCHLD;
                        p->self_exec_id++;

                        /* Make sure we're not reparenting to ourselves */
                        if (p == reaper)
                                p->p_opptr = child_reaper;
                        else
                                p->p_opptr = reaper;

                        if (p->pdeath_signal)
                                send_sig(p->pdeath_signal, p, 0);
                }
        }
        read_unlock(&tasklist_lock);
}

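/*
 * Walk the open-fd bitmap one word at a time and filp_close() every
 * file still installed in the table.  xchg() clears each slot
 * atomically, so a descriptor can never be closed twice.
 */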
static inline void close_files(struct files_struct * files)
{
        int i, j;

        j = 0;
        for (;;) {
                unsigned long set;
                i = j * __NFDBITS;
                if (i >= files->max_fdset || i >= files->max_fds)
                        break;
                set = files->open_fds->fds_bits[j++];
                while (set) {
                        if (set & 1) {
                                struct file * file = xchg(&files->fd[i], NULL);
                                if (file)
                                        filp_close(file, files);
                        }
                        i++;
                        set >>= 1;
                }
        }
}

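/*
 * Drop a reference on a files_struct.  The last reference closes all
 * remaining files and frees the descriptor tables, including the
 * separately allocated arrays if the table ever grew beyond the
 * embedded defaults.
 */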
void put_files_struct(struct files_struct *files)
{
        if (atomic_dec_and_test(&files->count)) {
                close_files(files);
                /*
                 * Free the fd and fdset arrays if we expanded them.
                 */
                if (files->fd != &files->fd_array[0])
                        free_fd_array(files->fd, files->max_fds);
                if (files->max_fdset > __FD_SETSIZE) {
                        free_fdset(files->open_fds, files->max_fdset);
                        free_fdset(files->close_on_exec, files->max_fdset);
                }
                kmem_cache_free(files_cachep, files);
        }
}

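/*
 * Detach the files_struct from the task under task_lock() (so other
 * CPUs see either the old table or NULL, never a half-freed one),
 * then drop our reference to it.
 */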
static inline void __exit_files(struct task_struct *tsk)
{
        struct files_struct * files = tsk->files;

        if (files) {
                task_lock(tsk);
                tsk->files = NULL;
                task_unlock(tsk);
                put_files_struct(files);
        }
}

void exit_files(struct task_struct *tsk)
{
        __exit_files(tsk);
}

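/*
 * Drop a reference on an fs_struct.  The last reference releases the
 * dentry and vfsmount references for root, pwd and (if set) altroot,
 * then frees the structure itself.
 */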
static inline void __put_fs_struct(struct fs_struct *fs)
{
        /* No need to hold fs->lock if we are killing it */
        if (atomic_dec_and_test(&fs->count)) {
                dput(fs->root);
                mntput(fs->rootmnt);
                dput(fs->pwd);
                mntput(fs->pwdmnt);
                if (fs->altroot) {
                        dput(fs->altroot);
                        mntput(fs->altrootmnt);
                }
                kmem_cache_free(fs_cachep, fs);
        }
}

void put_fs_struct(struct fs_struct *fs)
{
        __put_fs_struct(fs);
}

static inline void __exit_fs(struct task_struct *tsk)
{
        struct fs_struct * fs = tsk->fs;

        if (fs) {
                task_lock(tsk);
                tsk->fs = NULL;
                task_unlock(tsk);
                __put_fs_struct(fs);
        }
}

void exit_fs(struct task_struct *tsk)
{
        __exit_fs(tsk);
}

/*
 * We can use these to temporarily drop into
 * "lazy TLB" mode and back.
 */
struct mm_struct * start_lazy_tlb(void)
{
        struct mm_struct *mm = current->mm;
        current->mm = NULL;
        /* active_mm is still 'mm' */
        atomic_inc(&mm->mm_count);
        enter_lazy_tlb(mm, current, smp_processor_id());
        return mm;
}

void end_lazy_tlb(struct mm_struct *mm)
{
        struct mm_struct *active_mm = current->active_mm;

        current->mm = mm;
        if (mm != active_mm) {
                current->active_mm = mm;
                activate_mm(active_mm, mm);
        }
        mmdrop(active_mm);
}

/*
 * Turn us into a lazy TLB process if we
 * aren't already..
 */
static inline void __exit_mm(struct task_struct * tsk)
{
        struct mm_struct * mm = tsk->mm;

        mm_release();
        if (mm) {
                atomic_inc(&mm->mm_count);
                if (mm != tsk->active_mm) BUG();
                /* more a memory barrier than a real lock */
                task_lock(tsk);
                tsk->mm = NULL;
                task_unlock(tsk);
                enter_lazy_tlb(mm, current, smp_processor_id());
                mmput(mm);
        }
}

void exit_mm(struct task_struct *tsk)
{
        __exit_mm(tsk);
}

/*
 * Send signals to all our closest relatives so that they know
 * to properly mourn us..
 */
static void exit_notify(void)
{
        struct task_struct * p, *t;

        forget_original_parent(current);
        /*
         * Check to see if any process groups have become orphaned
         * as a result of our exiting, and if they have any stopped
         * jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
         *
         * Case i: Our father is in a different pgrp than we are
         * and we were the only connection outside, so our pgrp
         * is about to become orphaned.
         */

        t = current->p_pptr;

        if ((t->pgrp != current->pgrp) &&
            (t->session == current->session) &&
            will_become_orphaned_pgrp(current->pgrp, current) &&
            has_stopped_jobs(current->pgrp)) {
                kill_pg(current->pgrp, SIGHUP, 1);
                kill_pg(current->pgrp, SIGCONT, 1);
        }

        /* Let father know we died
         *
         * Thread signals are configurable, but you aren't going to use
         * that to send signals to arbitrary processes.
         * That stops right now.
         *
         * If the parent exec id doesn't match the exec id we saved
         * when we started then we know the parent has changed security
         * domain.
         *
         * If our self_exec id doesn't match our parent_exec_id then
         * we have changed execution domain as these two values started
         * the same after a fork.
         */

        if (current->exit_signal != SIGCHLD &&
            (current->parent_exec_id != t->self_exec_id ||
             current->self_exec_id != current->parent_exec_id) &&
            !capable(CAP_KILL))
                current->exit_signal = SIGCHLD;

        /*
         * This loop does two things:
         *
         * A.  Make init inherit all the child processes
         * B.  Check to see if any process groups have become orphaned
         *      as a result of our exiting, and if they have any stopped
         *      jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
         */

        write_lock_irq(&tasklist_lock);
        current->state = TASK_ZOMBIE;
        do_notify_parent(current, current->exit_signal);
        while (current->p_cptr != NULL) {
                p = current->p_cptr;
                current->p_cptr = p->p_osptr;
                p->p_ysptr = NULL;
                p->ptrace = 0;

                p->p_pptr = p->p_opptr;
                p->p_osptr = p->p_pptr->p_cptr;
                if (p->p_osptr)
                        p->p_osptr->p_ysptr = p;
                p->p_pptr->p_cptr = p;
                if (p->state == TASK_ZOMBIE)
                        do_notify_parent(p, p->exit_signal);
                /*
                 * process group orphan check
                 * Case ii: Our child is in a different pgrp
                 * than we are, and it was the only connection
                 * outside, so the child pgrp is now orphaned.
                 */
                if ((p->pgrp != current->pgrp) &&
                    (p->session == current->session)) {
                        int pgrp = p->pgrp;

                        write_unlock_irq(&tasklist_lock);
                        if (is_orphaned_pgrp(pgrp) && has_stopped_jobs(pgrp)) {
                                kill_pg(pgrp, SIGHUP, 1);
                                kill_pg(pgrp, SIGCONT, 1);
                        }
                        write_lock_irq(&tasklist_lock);
                }
        }
        write_unlock_irq(&tasklist_lock);
}

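/*
 * The final exit path for every task.  Releases every resource the
 * task still owns (mm, semaphores, files, fs, signal handlers, thread
 * state), notifies the parent, marks the task TASK_ZOMBIE via
 * exit_notify(), and schedules away for the last time.
 */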
NORET_TYPE void do_exit(long code)
{
        struct task_struct *tsk = current;

        if (in_interrupt())
                panic("Aiee, killing interrupt handler!");
        if (!tsk->pid)
                panic("Attempted to kill the idle task!");
        if (tsk->pid == 1)
                panic("Attempted to kill init!");
        tsk->flags |= PF_EXITING;
        del_timer_sync(&tsk->real_timer);

fake_volatile:
#ifdef CONFIG_BSD_PROCESS_ACCT
        acct_process(code);
#endif
        __exit_mm(tsk);

        lock_kernel();
        sem_exit();
        __exit_files(tsk);
        __exit_fs(tsk);
        exit_sighand(tsk);
        exit_thread();

        if (current->leader)
                disassociate_ctty(1);

        put_exec_domain(tsk->exec_domain);
        if (tsk->binfmt && tsk->binfmt->module)
                __MOD_DEC_USE_COUNT(tsk->binfmt->module);

        tsk->exit_code = code;
        exit_notify();
        schedule();
        BUG();
/*
 * In order to get rid of the "volatile function does return" message
 * I did this little loop that confuses gcc to think do_exit really
 * is volatile. In fact it's schedule() that is volatile in some
 * circumstances: when current->state = TASK_ZOMBIE, schedule() never
 * returns.
 *
 * In fact the natural way to do all this is to have the label and the
 * goto right after each other, but I put the fake_volatile label at
 * the start of the function just in case something /really/ bad
 * happens, and the schedule returns. This way we can try again. I'm
 * not paranoid: it's just that everybody is out to get me.
 */
        goto fake_volatile;
}

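/*
 * Signal the given completion (if any) and then exit.  Typically lets
 * a waiter know this thread is finished with shared state before the
 * task goes away.
 */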
NORET_TYPE void complete_and_exit(struct completion *comp, long code)
{
        if (comp)
                complete(comp);

        do_exit(code);
}

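/*
 * The exit status is packed into wait-status format here: the low
 * eight bits of the user-supplied code end up in bits 8-15, where
 * sys_wait4() (and WEXITSTATUS in userspace) expect to find them.
 */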
asmlinkage long sys_exit(int error_code)
{
        do_exit((error_code & 0xff) << 8);
}

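/*
 * wait4() pid semantics, as implemented by the loop below:
 *   pid >  0   wait for the child with that exact pid
 *   pid == 0   wait for any child in the caller's process group
 *   pid <  -1  wait for any child in process group -pid
 *   pid == -1  wait for any child at all
 */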
asmlinkage long sys_wait4(pid_t pid, unsigned int * stat_addr, int options, struct rusage * ru)
{
        int flag, retval;
        DECLARE_WAITQUEUE(wait, current);
        struct task_struct *tsk;

        if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL))
                return -EINVAL;

        add_wait_queue(&current->wait_chldexit, &wait);
repeat:
        flag = 0;
        current->state = TASK_INTERRUPTIBLE;
        read_lock(&tasklist_lock);
        tsk = current;
        do {
                struct task_struct *p;
                for (p = tsk->p_cptr ; p ; p = p->p_osptr) {
                        if (pid > 0) {
                                if (p->pid != pid)
                                        continue;
                        } else if (!pid) {
                                if (p->pgrp != current->pgrp)
                                        continue;
                        } else if (pid != -1) {
                                if (p->pgrp != -pid)
                                        continue;
                        }
                        /* Wait for all children (clone and not) if __WALL is set;
                         * otherwise, wait for clone children *only* if __WCLONE is
                         * set; otherwise, wait for non-clone children *only*.  (Note:
                         * A "clone" child here is one that reports to its parent
                         * using a signal other than SIGCHLD.) */
                        if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
                            && !(options & __WALL))
                                continue;
                        flag = 1;
                        switch (p->state) {
                        case TASK_STOPPED:
                                if (!p->exit_code)
                                        continue;
                                if (!(options & WUNTRACED) && !(p->ptrace & PT_PTRACED))
                                        continue;
                                read_unlock(&tasklist_lock);
                                retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
                                if (!retval && stat_addr)
                                        retval = put_user((p->exit_code << 8) | 0x7f, stat_addr);
                                if (!retval) {
                                        p->exit_code = 0;
                                        retval = p->pid;
                                }
                                goto end_wait4;
                        case TASK_ZOMBIE:
                                current->times.tms_cutime += p->times.tms_utime + p->times.tms_cutime;
                                current->times.tms_cstime += p->times.tms_stime + p->times.tms_cstime;
                                read_unlock(&tasklist_lock);
                                retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
                                if (!retval && stat_addr)
                                        retval = put_user(p->exit_code, stat_addr);
                                if (retval)
                                        goto end_wait4;
                                retval = p->pid;
                                if (p->p_opptr != p->p_pptr) {
                                        write_lock_irq(&tasklist_lock);
                                        REMOVE_LINKS(p);
                                        p->p_pptr = p->p_opptr;
                                        SET_LINKS(p);
                                        do_notify_parent(p, SIGCHLD);
                                        write_unlock_irq(&tasklist_lock);
                                } else
                                        release_task(p);
                                goto end_wait4;
                        default:
                                continue;
                        }
                }
                if (options & __WNOTHREAD)
                        break;
                tsk = next_thread(tsk);
        } while (tsk != current);
        read_unlock(&tasklist_lock);
        if (flag) {
                retval = 0;
                if (options & WNOHANG)
                        goto end_wait4;
                retval = -ERESTARTSYS;
                if (signal_pending(current))
                        goto end_wait4;
                schedule();
                goto repeat;
        }
        retval = -ECHILD;
end_wait4:
        current->state = TASK_RUNNING;
        remove_wait_queue(&current->wait_chldexit, &wait);
        return retval;
}

#if !defined(__alpha__) && !defined(__ia64__)

/*
 * sys_waitpid() remains for compatibility. waitpid() should be
 * implemented by calling sys_wait4() from libc.a.
 */
asmlinkage long sys_waitpid(pid_t pid, unsigned int * stat_addr, int options)
{
        return sys_wait4(pid, stat_addr, options, NULL);
}

#endif