a5014346ee68b5570b72158f855d4a0dc669496f
[linux-flexiantxendom0-3.2.10.git] / fs / jfs / jfs_logmgr.c
1 /*
2  *   Copyright (c) International Business Machines Corp., 2000-2003
3  *   Portions Copyright (c) Christoph Hellwig, 2001-2002
4  *
5  *   This program is free software;  you can redistribute it and/or modify
6  *   it under the terms of the GNU General Public License as published by
7  *   the Free Software Foundation; either version 2 of the License, or 
8  *   (at your option) any later version.
9  * 
10  *   This program is distributed in the hope that it will be useful,
11  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
13  *   the GNU General Public License for more details.
14  *
15  *   You should have received a copy of the GNU General Public License
16  *   along with this program;  if not, write to the Free Software 
17  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18  */
19
20 /*
21  *      jfs_logmgr.c: log manager
22  *
23  * for related information, see transaction manager (jfs_txnmgr.c), and
24  * recovery manager (jfs_logredo.c).
25  *
26  * note: for detail, RTFS.
27  *
28  *      log buffer manager:
29  * special purpose buffer manager supporting log i/o requirements.
30  * per log serial pageout of logpage
31  * queuing i/o requests and redrive i/o at iodone
32  * maintain current logpage buffer
33  * no caching since append only
34  * appropriate jfs buffer cache buffers as needed
35  *
36  *      group commit:
37  * transactions which wrote COMMIT records in the same in-memory
38  * log page during the pageout of previous/current log page(s) are
39  * committed together by the pageout of the page.
40  *
41  *      TBD lazy commit:
42  * transactions are committed asynchronously when the log page
43  * containing its COMMIT record is paged out when it becomes full;
44  *
45  *      serialization:
46  * . a per log lock serialize log write.
47  * . a per log lock serialize group commit.
48  * . a per log lock serialize log open/close;
49  *
50  *      TBD log integrity:
51  * careful-write (ping-pong) of last logpage to recover from crash
52  * in overwrite.
53  * detection of split (out-of-order) write of physical sectors
54  * of last logpage via timestamp at end of each sector
55  * with its mirror data array at the trailer.
56  *
57  *      alternatives:
58  * lsn - 64-bit monotonically increasing integer vs
59  * 32-bit lspn and page eor.
60  */
61
62 #include <linux/fs.h>
63 #include <linux/blkdev.h>
64 #include <linux/interrupt.h>
65 #include <linux/smp_lock.h>
66 #include <linux/completion.h>
67 #include <linux/buffer_head.h>          /* for sync_blockdev() */
68 #include <linux/bio.h>
69 #include <linux/suspend.h>
70 #include "jfs_incore.h"
71 #include "jfs_filsys.h"
72 #include "jfs_metapage.h"
73 #include "jfs_txnmgr.h"
74 #include "jfs_debug.h"
75
76
77 /*
78  * lbuf's ready to be redriven.  Protected by log_redrive_lock (jfsIO thread)
79  */
80 static struct lbuf *log_redrive_list;
81 static spinlock_t log_redrive_lock = SPIN_LOCK_UNLOCKED;
82 DECLARE_WAIT_QUEUE_HEAD(jfs_IO_thread_wait);
83
84
85 /*
86  *      log read/write serialization (per log)
87  */
88 #define LOG_LOCK_INIT(log)      init_MUTEX(&(log)->loglock)
89 #define LOG_LOCK(log)           down(&((log)->loglock))
90 #define LOG_UNLOCK(log)         up(&((log)->loglock))
91
92
93 /*
94  *      log group commit serialization (per log)
95  */
96
97 #define LOGGC_LOCK_INIT(log)    spin_lock_init(&(log)->gclock)
98 #define LOGGC_LOCK(log)         spin_lock_irq(&(log)->gclock)
99 #define LOGGC_UNLOCK(log)       spin_unlock_irq(&(log)->gclock)
100 #define LOGGC_WAKEUP(tblk)      wake_up_all(&(tblk)->gcwait)
101
102 /*
103  *      log sync serialization (per log)
104  */
105 #define LOGSYNC_DELTA(logsize)          min((logsize)/8, 128*LOGPSIZE)
106 #define LOGSYNC_BARRIER(logsize)        ((logsize)/4)
107 /*
108 #define LOGSYNC_DELTA(logsize)          min((logsize)/4, 256*LOGPSIZE)
109 #define LOGSYNC_BARRIER(logsize)        ((logsize)/2)
110 */
111
112
113 /*
114  *      log buffer cache synchronization
115  */
116 static spinlock_t jfsLCacheLock = SPIN_LOCK_UNLOCKED;
117
118 #define LCACHE_LOCK(flags)      spin_lock_irqsave(&jfsLCacheLock, flags)
119 #define LCACHE_UNLOCK(flags)    spin_unlock_irqrestore(&jfsLCacheLock, flags)
120
121 /*
122  * See __SLEEP_COND in jfs_locks.h
123  */
124 #define LCACHE_SLEEP_COND(wq, cond, flags)      \
125 do {                                            \
126         if (cond)                               \
127                 break;                          \
128         __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \
129 } while (0)
130
131 #define LCACHE_WAKEUP(event)    wake_up(event)
132
133
134 /*
135  *      lbuf buffer cache (lCache) control
136  */
137 /* log buffer manager pageout control (cumulative, inclusive) */
138 #define lbmREAD         0x0001
139 #define lbmWRITE        0x0002  /* enqueue at tail of write queue;
140                                  * init pageout if at head of queue;
141                                  */
142 #define lbmRELEASE      0x0004  /* remove from write queue
143                                  * at completion of pageout;
144                                  * do not free/recycle it yet:
145                                  * caller will free it;
146                                  */
147 #define lbmSYNC         0x0008  /* do not return to freelist
148                                  * when removed from write queue;
149                                  */
150 #define lbmFREE         0x0010  /* return to freelist
151                                  * at completion of pageout;
152                                  * the buffer may be recycled;
153                                  */
154 #define lbmDONE         0x0020
155 #define lbmERROR        0x0040
156 #define lbmGC           0x0080  /* lbmIODone to perform post-GC processing
157                                  * of log page
158                                  */
159 #define lbmDIRECT       0x0100
160
161 /*
162  * external references
163  */
164 extern void txLazyUnlock(struct tblock * tblk);
165 extern int jfs_stop_threads;
166 extern struct completion jfsIOwait;
167
168 /*
169  * forward references
170  */
171 static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk,
172                          struct lrd * lrd, struct tlock * tlck);
173
174 static int lmNextPage(struct jfs_log * log);
175 static int lmLogFileSystem(struct jfs_log * log, char *uuid, int activate);
176
177 static int lbmLogInit(struct jfs_log * log);
178 static void lbmLogShutdown(struct jfs_log * log);
179 static struct lbuf *lbmAllocate(struct jfs_log * log, int);
180 static void lbmFree(struct lbuf * bp);
181 static void lbmfree(struct lbuf * bp);
182 static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp);
183 static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block);
184 static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag);
185 static int lbmIOWait(struct lbuf * bp, int flag);
186 static bio_end_io_t lbmIODone;
187 static void lbmStartIO(struct lbuf * bp);
188 static void lmGCwrite(struct jfs_log * log, int cant_block);
189
190
191
192 /*
193  *      statistics
194  */
195 #ifdef CONFIG_JFS_STATISTICS
196 struct lmStat {
197         uint commit;            /* # of commit */
198         uint pagedone;          /* # of page written */
199         uint submitted;         /* # of pages submitted */
200         uint full_page;         /* # of full pages submitted */
201         uint partial_page;      /* # of partial pages submitted */
202 } lmStat;
203 #endif
204
205
206 /*
207  * NAME:        lmLog()
208  *
209  * FUNCTION:    write a log record;
210  *
211  * PARAMETER:
212  *
213  * RETURN:      lsn - offset to the next log record to write (end-of-log);
214  *              -1  - error;
215  *
216  * note: todo: log error handler
217  */
218 int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
219           struct tlock * tlck)
220 {
221         int lsn;
222         int diffp, difft;
223         struct metapage *mp = NULL;
224
225         jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p",
226                  log, tblk, lrd, tlck);
227
228         LOG_LOCK(log);
229
230         /* log by (out-of-transaction) JFS ? */
231         if (tblk == NULL)
232                 goto writeRecord;
233
234         /* log from page ? */
235         if (tlck == NULL ||
236             tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL)
237                 goto writeRecord;
238
239         /*
240          *      initialize/update page/transaction recovery lsn
241          */
242         lsn = log->lsn;
243
244         LOGSYNC_LOCK(log);
245
246         /*
247          * initialize page lsn if first log write of the page
248          */
249         if (mp->lsn == 0) {
250                 mp->log = log;
251                 mp->lsn = lsn;
252                 log->count++;
253
254                 /* insert page at tail of logsynclist */
255                 list_add_tail(&mp->synclist, &log->synclist);
256         }
257
258         /*
259          *      initialize/update lsn of tblock of the page
260          *
261          * transaction inherits oldest lsn of pages associated
262          * with allocation/deallocation of resources (their
263          * log records are used to reconstruct allocation map
264          * at recovery time: inode for inode allocation map,
265          * B+-tree index of extent descriptors for block
266          * allocation map);
267          * allocation map pages inherit transaction lsn at
268          * commit time to allow forwarding log syncpt past log
269          * records associated with allocation/deallocation of
270          * resources only after persistent map of these map pages
271          * have been updated and propagated to home.
272          */
273         /*
274          * initialize transaction lsn:
275          */
276         if (tblk->lsn == 0) {
277                 /* inherit lsn of its first page logged */
278                 tblk->lsn = mp->lsn;
279                 log->count++;
280
281                 /* insert tblock after the page on logsynclist */
282                 list_add(&tblk->synclist, &mp->synclist);
283         }
284         /*
285          * update transaction lsn:
286          */
287         else {
288                 /* inherit oldest/smallest lsn of page */
289                 logdiff(diffp, mp->lsn, log);
290                 logdiff(difft, tblk->lsn, log);
291                 if (diffp < difft) {
292                         /* update tblock lsn with page lsn */
293                         tblk->lsn = mp->lsn;
294
295                         /* move tblock after page on logsynclist */
296                         list_move(&tblk->synclist, &mp->synclist);
297                 }
298         }
299
300         LOGSYNC_UNLOCK(log);
301
302         /*
303          *      write the log record
304          */
305       writeRecord:
306         lsn = lmWriteRecord(log, tblk, lrd, tlck);
307
308         /*
309          * forward log syncpt if log reached next syncpt trigger
310          */
311         logdiff(diffp, lsn, log);
312         if (diffp >= log->nextsync)
313                 lsn = lmLogSync(log, 0);
314
315         /* update end-of-log lsn */
316         log->lsn = lsn;
317
318         LOG_UNLOCK(log);
319
320         /* return end-of-log address */
321         return lsn;
322 }
323
324
325 /*
326  * NAME:        lmWriteRecord()
327  *
328  * FUNCTION:    move the log record to current log page
329  *
330  * PARAMETER:   cd      - commit descriptor
331  *
332  * RETURN:      end-of-log address
333  *                      
334  * serialization: LOG_LOCK() held on entry/exit
335  */
336 static int
337 lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
338               struct tlock * tlck)
339 {
340         int lsn = 0;            /* end-of-log address */
341         struct lbuf *bp;        /* dst log page buffer */
342         struct logpage *lp;     /* dst log page */
343         caddr_t dst;            /* destination address in log page */
344         int dstoffset;          /* end-of-log offset in log page */
345         int freespace;          /* free space in log page */
346         caddr_t p;              /* src meta-data page */
347         caddr_t src;
348         int srclen;
349         int nbytes;             /* number of bytes to move */
350         int i;
351         int len;
352         struct linelock *linelock;
353         struct lv *lv;
354         struct lvd *lvd;
355         int l2linesize;
356
357         len = 0;
358
359         /* retrieve destination log page to write */
360         bp = (struct lbuf *) log->bp;
361         lp = (struct logpage *) bp->l_ldata;
362         dstoffset = log->eor;
363
364         /* any log data to write ? */
365         if (tlck == NULL)
366                 goto moveLrd;
367
368         /*
369          *      move log record data
370          */
371         /* retrieve source meta-data page to log */
372         if (tlck->flag & tlckPAGELOCK) {
373                 p = (caddr_t) (tlck->mp->data);
374                 linelock = (struct linelock *) & tlck->lock;
375         }
376         /* retrieve source in-memory inode to log */
377         else if (tlck->flag & tlckINODELOCK) {
378                 if (tlck->type & tlckDTREE)
379                         p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot;
380                 else
381                         p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot;
382                 linelock = (struct linelock *) & tlck->lock;
383         }
384 #ifdef  _JFS_WIP
385         else if (tlck->flag & tlckINLINELOCK) {
386
387                 inlinelock = (struct inlinelock *) & tlck;
388                 p = (caddr_t) & inlinelock->pxd;
389                 linelock = (struct linelock *) & tlck;
390         }
391 #endif                          /* _JFS_WIP */
392         else {
393                 jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck);
394                 return 0;       /* Probably should trap */
395         }
396         l2linesize = linelock->l2linesize;
397
398       moveData:
399         ASSERT(linelock->index <= linelock->maxcnt);
400
401         lv = linelock->lv;
402         for (i = 0; i < linelock->index; i++, lv++) {
403                 if (lv->length == 0)
404                         continue;
405
406                 /* is page full ? */
407                 if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) {
408                         /* page become full: move on to next page */
409                         lmNextPage(log);
410
411                         bp = log->bp;
412                         lp = (struct logpage *) bp->l_ldata;
413                         dstoffset = LOGPHDRSIZE;
414                 }
415
416                 /*
417                  * move log vector data
418                  */
419                 src = (u8 *) p + (lv->offset << l2linesize);
420                 srclen = lv->length << l2linesize;
421                 len += srclen;
422                 while (srclen > 0) {
423                         freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
424                         nbytes = min(freespace, srclen);
425                         dst = (caddr_t) lp + dstoffset;
426                         memcpy(dst, src, nbytes);
427                         dstoffset += nbytes;
428
429                         /* is page not full ? */
430                         if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
431                                 break;
432
433                         /* page become full: move on to next page */
434                         lmNextPage(log);
435
436                         bp = (struct lbuf *) log->bp;
437                         lp = (struct logpage *) bp->l_ldata;
438                         dstoffset = LOGPHDRSIZE;
439
440                         srclen -= nbytes;
441                         src += nbytes;
442                 }
443
444                 /*
445                  * move log vector descriptor
446                  */
447                 len += 4;
448                 lvd = (struct lvd *) ((caddr_t) lp + dstoffset);
449                 lvd->offset = cpu_to_le16(lv->offset);
450                 lvd->length = cpu_to_le16(lv->length);
451                 dstoffset += 4;
452                 jfs_info("lmWriteRecord: lv offset:%d length:%d",
453                          lv->offset, lv->length);
454         }
455
456         if ((i = linelock->next)) {
457                 linelock = (struct linelock *) lid_to_tlock(i);
458                 goto moveData;
459         }
460
461         /*
462          *      move log record descriptor
463          */
464       moveLrd:
465         lrd->length = cpu_to_le16(len);
466
467         src = (caddr_t) lrd;
468         srclen = LOGRDSIZE;
469
470         while (srclen > 0) {
471                 freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset;
472                 nbytes = min(freespace, srclen);
473                 dst = (caddr_t) lp + dstoffset;
474                 memcpy(dst, src, nbytes);
475
476                 dstoffset += nbytes;
477                 srclen -= nbytes;
478
479                 /* are there more to move than freespace of page ? */
480                 if (srclen)
481                         goto pageFull;
482
483                 /*
484                  * end of log record descriptor
485                  */
486
487                 /* update last log record eor */
488                 log->eor = dstoffset;
489                 bp->l_eor = dstoffset;
490                 lsn = (log->page << L2LOGPSIZE) + dstoffset;
491
492                 if (lrd->type & cpu_to_le16(LOG_COMMIT)) {
493                         tblk->clsn = lsn;
494                         jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn,
495                                  bp->l_eor);
496
497                         INCREMENT(lmStat.commit);       /* # of commit */
498
499                         /*
500                          * enqueue tblock for group commit:
501                          *
502                          * enqueue tblock of non-trivial/synchronous COMMIT
503                          * at tail of group commit queue
504                          * (trivial/asynchronous COMMITs are ignored by
505                          * group commit.)
506                          */
507                         LOGGC_LOCK(log);
508
509                         /* init tblock gc state */
510                         tblk->flag = tblkGC_QUEUE;
511                         tblk->bp = log->bp;
512                         tblk->pn = log->page;
513                         tblk->eor = log->eor;
514
515                         /* enqueue transaction to commit queue */
516                         tblk->cqnext = NULL;
517                         if (log->cqueue.head) {
518                                 log->cqueue.tail->cqnext = tblk;
519                                 log->cqueue.tail = tblk;
520                         } else
521                                 log->cqueue.head = log->cqueue.tail = tblk;
522
523                         LOGGC_UNLOCK(log);
524                 }
525
526                 jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x",
527                         le16_to_cpu(lrd->type), log->bp, log->page, dstoffset);
528
529                 /* page not full ? */
530                 if (dstoffset < LOGPSIZE - LOGPTLRSIZE)
531                         return lsn;
532
533               pageFull:
534                 /* page become full: move on to next page */
535                 lmNextPage(log);
536
537                 bp = (struct lbuf *) log->bp;
538                 lp = (struct logpage *) bp->l_ldata;
539                 dstoffset = LOGPHDRSIZE;
540                 src += nbytes;
541         }
542
543         return lsn;
544 }
545
546
547 /*
548  * NAME:        lmNextPage()
549  *
550  * FUNCTION:    write current page and allocate next page.
551  *
552  * PARAMETER:   log
553  *
554  * RETURN:      0
555  *                      
556  * serialization: LOG_LOCK() held on entry/exit
557  */
558 static int lmNextPage(struct jfs_log * log)
559 {
560         struct logpage *lp;
561         int lspn;               /* log sequence page number */
562         int pn;                 /* current page number */
563         struct lbuf *bp;
564         struct lbuf *nextbp;
565         struct tblock *tblk;
566
567         /* get current log page number and log sequence page number */
568         pn = log->page;
569         bp = log->bp;
570         lp = (struct logpage *) bp->l_ldata;
571         lspn = le32_to_cpu(lp->h.page);
572
573         LOGGC_LOCK(log);
574
575         /*
576          *      write or queue the full page at the tail of write queue
577          */
578         /* get the tail tblk on commit queue */
579         tblk = log->cqueue.tail;
580
581         /* every tblk who has COMMIT record on the current page,
582          * and has not been committed, must be on commit queue
583          * since tblk is queued at commit queueu at the time
584          * of writing its COMMIT record on the page before
585          * page becomes full (even though the tblk thread
586          * who wrote COMMIT record may have been suspended
587          * currently);
588          */
589
590         /* is page bound with outstanding tail tblk ? */
591         if (tblk && tblk->pn == pn) {
592                 /* mark tblk for end-of-page */
593                 tblk->flag |= tblkGC_EOP;
594
595                 if (log->cflag & logGC_PAGEOUT) {
596                         /* if page is not already on write queue,
597                          * just enqueue (no lbmWRITE to prevent redrive)
598                          * buffer to wqueue to ensure correct serial order
599                          * of the pages since log pages will be added
600                          * continuously
601                          */
602                         if (bp->l_wqnext == NULL)
603                                 lbmWrite(log, bp, 0, 0);
604                 } else {
605                         /*
606                          * No current GC leader, initiate group commit
607                          */
608                         log->cflag |= logGC_PAGEOUT;
609                         lmGCwrite(log, 0);
610                 }
611         }
612         /* page is not bound with outstanding tblk:
613          * init write or mark it to be redriven (lbmWRITE)
614          */
615         else {
616                 /* finalize the page */
617                 bp->l_ceor = bp->l_eor;
618                 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
619                 lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0);
620         }
621         LOGGC_UNLOCK(log);
622
623         /*
624          *      allocate/initialize next page
625          */
626         /* if log wraps, the first data page of log is 2
627          * (0 never used, 1 is superblock).
628          */
629         log->page = (pn == log->size - 1) ? 2 : pn + 1;
630         log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */
631
632         /* allocate/initialize next log page buffer */
633         nextbp = lbmAllocate(log, log->page);
634         nextbp->l_eor = log->eor;
635         log->bp = nextbp;
636
637         /* initialize next log page */
638         lp = (struct logpage *) nextbp->l_ldata;
639         lp->h.page = lp->t.page = cpu_to_le32(lspn + 1);
640         lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);
641
642         return 0;
643 }
644
645
646 /*
647  * NAME:        lmGroupCommit()
648  *
649  * FUNCTION:    group commit
650  *      initiate pageout of the pages with COMMIT in the order of
651  *      page number - redrive pageout of the page at the head of
652  *      pageout queue until full page has been written.
653  *
654  * RETURN:      
655  *
656  * NOTE:
657  *      LOGGC_LOCK serializes log group commit queue, and
658  *      transaction blocks on the commit queue.
659  *      N.B. LOG_LOCK is NOT held during lmGroupCommit().
660  */
661 int lmGroupCommit(struct jfs_log * log, struct tblock * tblk)
662 {
663         int rc = 0;
664
665         LOGGC_LOCK(log);
666
667         /* group committed already ? */
668         if (tblk->flag & tblkGC_COMMITTED) {
669                 if (tblk->flag & tblkGC_ERROR)
670                         rc = -EIO;
671
672                 LOGGC_UNLOCK(log);
673                 return rc;
674         }
675         jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc);
676
677         if (tblk->xflag & COMMIT_LAZY)
678                 tblk->flag |= tblkGC_LAZY;
679
680         if ((!(log->cflag & logGC_PAGEOUT)) && log->cqueue.head &&
681             (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag))) {
682                 /*
683                  * No pageout in progress
684                  *
685                  * start group commit as its group leader.
686                  */
687                 log->cflag |= logGC_PAGEOUT;
688
689                 lmGCwrite(log, 0);
690         }
691
692         if (tblk->xflag & COMMIT_LAZY) {
693                 /*
694                  * Lazy transactions can leave now
695                  */
696                 LOGGC_UNLOCK(log);
697                 return 0;
698         }
699
700         /* lmGCwrite gives up LOGGC_LOCK, check again */
701
702         if (tblk->flag & tblkGC_COMMITTED) {
703                 if (tblk->flag & tblkGC_ERROR)
704                         rc = -EIO;
705
706                 LOGGC_UNLOCK(log);
707                 return rc;
708         }
709
710         /* upcount transaction waiting for completion
711          */
712         log->gcrtc++;
713         tblk->flag |= tblkGC_READY;
714
715         __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED),
716                      LOGGC_LOCK(log), LOGGC_UNLOCK(log));
717
718         /* removed from commit queue */
719         if (tblk->flag & tblkGC_ERROR)
720                 rc = -EIO;
721
722         LOGGC_UNLOCK(log);
723         return rc;
724 }
725
726 /*
727  * NAME:        lmGCwrite()
728  *
729  * FUNCTION:    group commit write
730  *      initiate write of log page, building a group of all transactions
731  *      with commit records on that page.
732  *
733  * RETURN:      None
734  *
735  * NOTE:
736  *      LOGGC_LOCK must be held by caller.
737  *      N.B. LOG_LOCK is NOT held during lmGroupCommit().
738  */
739 static void lmGCwrite(struct jfs_log * log, int cant_write)
740 {
741         struct lbuf *bp;
742         struct logpage *lp;
743         int gcpn;               /* group commit page number */
744         struct tblock *tblk;
745         struct tblock *xtblk;
746
747         /*
748          * build the commit group of a log page
749          *
750          * scan commit queue and make a commit group of all
751          * transactions with COMMIT records on the same log page.
752          */
753         /* get the head tblk on the commit queue */
754         tblk = xtblk = log->cqueue.head;
755         gcpn = tblk->pn;
756
757         while (tblk && tblk->pn == gcpn) {
758                 xtblk = tblk;
759
760                 /* state transition: (QUEUE, READY) -> COMMIT */
761                 tblk->flag |= tblkGC_COMMIT;
762                 tblk = tblk->cqnext;
763         }
764         tblk = xtblk;           /* last tblk of the page */
765
766         /*
767          * pageout to commit transactions on the log page.
768          */
769         bp = (struct lbuf *) tblk->bp;
770         lp = (struct logpage *) bp->l_ldata;
771         /* is page already full ? */
772         if (tblk->flag & tblkGC_EOP) {
773                 /* mark page to free at end of group commit of the page */
774                 tblk->flag &= ~tblkGC_EOP;
775                 tblk->flag |= tblkGC_FREE;
776                 bp->l_ceor = bp->l_eor;
777                 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
778                 lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC,
779                          cant_write);
780                 INCREMENT(lmStat.full_page);
781         }
782         /* page is not yet full */
783         else {
784                 bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */
785                 lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor);
786                 lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write);
787                 INCREMENT(lmStat.partial_page);
788         }
789 }
790
791 /*
792  * NAME:        lmPostGC()
793  *
794  * FUNCTION:    group commit post-processing
795  *      Processes transactions after their commit records have been written
796  *      to disk, redriving log I/O if necessary.
797  *
798  * RETURN:      None
799  *
800  * NOTE:
801  *      This routine is called a interrupt time by lbmIODone
802  */
803 void lmPostGC(struct lbuf * bp)
804 {
805         unsigned long flags;
806         struct jfs_log *log = bp->l_log;
807         struct logpage *lp;
808         struct tblock *tblk;
809
810         //LOGGC_LOCK(log);
811         spin_lock_irqsave(&log->gclock, flags);
812         /*
813          * current pageout of group commit completed.
814          *
815          * remove/wakeup transactions from commit queue who were
816          * group committed with the current log page
817          */
818         while ((tblk = log->cqueue.head) && (tblk->flag & tblkGC_COMMIT)) {
819                 /* if transaction was marked GC_COMMIT then
820                  * it has been shipped in the current pageout
821                  * and made it to disk - it is committed.
822                  */
823
824                 if (bp->l_flag & lbmERROR)
825                         tblk->flag |= tblkGC_ERROR;
826
827                 /* remove it from the commit queue */
828                 log->cqueue.head = tblk->cqnext;
829                 if (log->cqueue.head == NULL)
830                         log->cqueue.tail = NULL;
831                 tblk->flag &= ~tblkGC_QUEUE;
832                 tblk->cqnext = 0;
833
834                 if (tblk == log->flush_tblk) {
835                         /* we can stop flushing the log now */
836                         clear_bit(log_FLUSH, &log->flag);
837                         log->flush_tblk = NULL;
838                 }
839
840                 jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk,
841                          tblk->flag);
842
843                 if (!(tblk->xflag & COMMIT_FORCE))
844                         /*
845                          * Hand tblk over to lazy commit thread
846                          */
847                         txLazyUnlock(tblk);
848                 else {
849                         /* state transition: COMMIT -> COMMITTED */
850                         tblk->flag |= tblkGC_COMMITTED;
851
852                         if (tblk->flag & tblkGC_READY)
853                                 log->gcrtc--;
854
855                         LOGGC_WAKEUP(tblk);
856                 }
857
858                 /* was page full before pageout ?
859                  * (and this is the last tblk bound with the page)
860                  */
861                 if (tblk->flag & tblkGC_FREE)
862                         lbmFree(bp);
863                 /* did page become full after pageout ?
864                  * (and this is the last tblk bound with the page)
865                  */
866                 else if (tblk->flag & tblkGC_EOP) {
867                         /* finalize the page */
868                         lp = (struct logpage *) bp->l_ldata;
869                         bp->l_ceor = bp->l_eor;
870                         lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
871                         jfs_info("lmPostGC: calling lbmWrite");
872                         lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE,
873                                  1);
874                 }
875
876         }
877
        /* are there any transactions who have entered lmGroupCommit()
         * (whose COMMITs are after that of the last log page written)?
         * They are waiting for new group commit (above at (SLEEP 1))
         * or lazy transactions are on a full (queued) log page;
         * select the latest ready transaction as new group leader and
         * wake her up to lead her group.
         */
885         if ((tblk = log->cqueue.head) &&
886             ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) ||
887              test_bit(log_FLUSH, &log->flag)))
888                 /*
889                  * Call lmGCwrite with new group leader
890                  */
891                 lmGCwrite(log, 1);
892
893         /* no transaction are ready yet (transactions are only just
894          * queued (GC_QUEUE) and not entered for group commit yet).
895          * the first transaction entering group commit
896          * will elect herself as new group leader.
897          */
898         else
899                 log->cflag &= ~logGC_PAGEOUT;
900
901         //LOGGC_UNLOCK(log);
902         spin_unlock_irqrestore(&log->gclock, flags);
903         return;
904 }
905
906 /*
907  * NAME:        lmLogSync()
908  *
909  * FUNCTION:    write log SYNCPT record for specified log
910  *      if new sync address is available
911  *      (normally the case if sync() is executed by back-ground
912  *      process).
913  *      if not, explicitly run jfs_blogsync() to initiate
914  *      getting of new sync address.
915  *      calculate new value of i_nextsync which determines when
916  *      this code is called again.
917  *
918  *      this is called only from lmLog().
919  *
920  * PARAMETER:   ip      - pointer to logs inode.
921  *
922  * RETURN:      0
923  *                      
924  * serialization: LOG_LOCK() held on entry/exit
925  */
926 int lmLogSync(struct jfs_log * log, int nosyncwait)
927 {
928         int logsize;
929         int written;            /* written since last syncpt */
930         int free;               /* free space left available */
931         int delta;              /* additional delta to write normally */
932         int more;               /* additional write granted */
933         struct lrd lrd;
934         int lsn;
935         struct logsyncblk *lp;
936
937         /*
938          *      forward syncpt
939          */
940         /* if last sync is same as last syncpt,
941          * invoke sync point forward processing to update sync.
942          */
943
944         if (log->sync == log->syncpt) {
945                 LOGSYNC_LOCK(log);
946                 /* ToDo: push dirty metapages out to disk */
947 //              bmLogSync(log);
948
949                 if (list_empty(&log->synclist))
950                         log->sync = log->lsn;
951                 else {
952                         lp = list_entry(log->synclist.next,
953                                         struct logsyncblk, synclist);
954                         log->sync = lp->lsn;
955                 }
956                 LOGSYNC_UNLOCK(log);
957
958         }
959
960         /* if sync is different from last syncpt,
961          * write a SYNCPT record with syncpt = sync.
962          * reset syncpt = sync
963          */
964         if (log->sync != log->syncpt) {
965                 struct super_block *sb = log->sb;
966                 struct jfs_sb_info *sbi = JFS_SBI(sb);
967
968                 /*
969                  * We need to make sure all of the "written" metapages
970                  * actually make it to disk
971                  */
972                 filemap_fdatawrite(sbi->ipbmap->i_mapping);
973                 filemap_fdatawrite(sbi->ipimap->i_mapping);
974                 filemap_fdatawrite(sb->s_bdev->bd_inode->i_mapping);
975                 filemap_fdatawait(sbi->ipbmap->i_mapping);
976                 filemap_fdatawait(sbi->ipimap->i_mapping);
977                 filemap_fdatawait(sb->s_bdev->bd_inode->i_mapping);
978
979                 lrd.logtid = 0;
980                 lrd.backchain = 0;
981                 lrd.type = cpu_to_le16(LOG_SYNCPT);
982                 lrd.length = 0;
983                 lrd.log.syncpt.sync = cpu_to_le32(log->sync);
984                 lsn = lmWriteRecord(log, NULL, &lrd, NULL);
985
986                 log->syncpt = log->sync;
987         } else
988                 lsn = log->lsn;
989
990         /*
991          *      setup next syncpt trigger (SWAG)
992          */
993         logsize = log->logsize;
994
995         logdiff(written, lsn, log);
996         free = logsize - written;
997         delta = LOGSYNC_DELTA(logsize);
998         more = min(free / 2, delta);
999         if (more < 2 * LOGPSIZE) {
1000                 jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n");
1001                 /*
1002                  *      log wrapping
1003                  *
1004                  * option 1 - panic ? No.!
1005                  * option 2 - shutdown file systems
1006                  *            associated with log ?
1007                  * option 3 - extend log ?
1008                  */
1009                 /*
1010                  * option 4 - second chance
1011                  *
1012                  * mark log wrapped, and continue.
1013                  * when all active transactions are completed,
1014                  * mark log vaild for recovery.
1015                  * if crashed during invalid state, log state
1016                  * implies invald log, forcing fsck().
1017                  */
1018                 /* mark log state log wrap in log superblock */
1019                 /* log->state = LOGWRAP; */
1020
1021                 /* reset sync point computation */
1022                 log->syncpt = log->sync = lsn;
1023                 log->nextsync = delta;
1024         } else
1025                 /* next syncpt trigger = written + more */
1026                 log->nextsync = written + more;
1027
1028         /* return if lmLogSync() from outside of transaction, e.g., sync() */
1029         if (nosyncwait)
1030                 return lsn;
1031
1032         /* if number of bytes written from last sync point is more
1033          * than 1/4 of the log size, stop new transactions from
1034          * starting until all current transactions are completed
1035          * by setting syncbarrier flag.
1036          */
1037         if (written > LOGSYNC_BARRIER(logsize) && logsize > 32 * LOGPSIZE) {
1038                 set_bit(log_SYNCBARRIER, &log->flag);
1039                 jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn,
1040                          log->syncpt);
1041                 /*
1042                  * We may have to initiate group commit
1043                  */
1044                 jfs_flush_journal(log, 0);
1045         }
1046
1047         return lsn;
1048 }
1049
1050
1051 /*
1052  * NAME:        lmLogOpen()
1053  *
1054  * FUNCTION:    open the log on first open;
1055  *      insert filesystem in the active list of the log.
1056  *
1057  * PARAMETER:   ipmnt   - file system mount inode
1058  *              iplog   - log inode (out)
1059  *
1060  * RETURN:
1061  *
1062  * serialization:
1063  */
1064 int lmLogOpen(struct super_block *sb, struct jfs_log ** logptr)
1065 {
1066         int rc;
1067         struct block_device *bdev;
1068         struct jfs_log *log;
1069
1070         if (!(log = kmalloc(sizeof(struct jfs_log), GFP_KERNEL)))
1071                 return -ENOMEM;
1072         memset(log, 0, sizeof(struct jfs_log));
1073         init_waitqueue_head(&log->syncwait);
1074
1075         log->sb = sb;           /* This should be a list */
1076
1077         if (!(JFS_SBI(sb)->mntflag & JFS_INLINELOG))
1078                 goto externalLog;
1079
1080         /*
1081          *      in-line log in host file system
1082          *
1083          * file system to log have 1-to-1 relationship;
1084          */
1085
1086         set_bit(log_INLINELOG, &log->flag);
1087         log->bdev = sb->s_bdev;
1088         log->base = addressPXD(&JFS_SBI(sb)->logpxd);
1089         log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
1090             (L2LOGPSIZE - sb->s_blocksize_bits);
1091         log->l2bsize = sb->s_blocksize_bits;
1092         ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits);
1093
1094         /*
1095          * initialize log.
1096          */
1097         if ((rc = lmLogInit(log)))
1098                 goto free;
1099         goto out;
1100
1101         /*
1102          *      external log as separate logical volume
1103          *
1104          * file systems to log may have n-to-1 relationship;
1105          */
1106       externalLog:
1107
1108         bdev = open_by_devnum(JFS_SBI(sb)->logdev,
1109                                         FMODE_READ|FMODE_WRITE, BDEV_FS);
1110         if (IS_ERR(bdev)) {
1111                 rc = -PTR_ERR(bdev);
1112                 goto free;
1113         }
1114
1115         if ((rc = bd_claim(bdev, log))) {
1116                 goto close;
1117         }
1118
1119         log->bdev = bdev;
1120         memcpy(log->uuid, JFS_SBI(sb)->loguuid, sizeof(log->uuid));
1121         
1122         /*
1123          * initialize log:
1124          */
1125         if ((rc = lmLogInit(log)))
1126                 goto unclaim;
1127
1128         /*
1129          * add file system to log active file system list
1130          */
1131         if ((rc = lmLogFileSystem(log, JFS_SBI(sb)->uuid, 1)))
1132                 goto shutdown;
1133
1134       out:
1135         *logptr = log;
1136         return 0;
1137
1138         /*
1139          *      unwind on error
1140          */
1141       shutdown:         /* unwind lbmLogInit() */
1142         lbmLogShutdown(log);
1143
1144       unclaim:
1145         bd_release(bdev);
1146
1147       close:            /* close external log device */
1148         blkdev_put(bdev, BDEV_FS);
1149
1150       free:             /* free log descriptor */
1151         kfree(log);
1152
1153         jfs_warn("lmLogOpen: exit(%d)", rc);
1154         return rc;
1155 }
1156
1157
1158 /*
1159  * NAME:        lmLogInit()
1160  *
1161  * FUNCTION:    log initialization at first log open.
1162  *
1163  *      logredo() (or logformat()) should have been run previously.
1164  *      initialize the log inode from log superblock.
1165  *      set the log state in the superblock to LOGMOUNT and
1166  *      write SYNCPT log record.
1167  *              
1168  * PARAMETER:   log     - log structure
1169  *
1170  * RETURN:      0       - if ok
1171  *              -EINVAL - bad log magic number or superblock dirty
1172  *              error returned from logwait()
1173  *                      
1174  * serialization: single first open thread
1175  */
1176 int lmLogInit(struct jfs_log * log)
1177 {
1178         int rc = 0;
1179         struct lrd lrd;
1180         struct logsuper *logsuper;
1181         struct lbuf *bpsuper;
1182         struct lbuf *bp;
1183         struct logpage *lp;
1184         int lsn;
1185
1186         jfs_info("lmLogInit: log:0x%p", log);
1187
1188         /*
1189          * log inode is overlaid on generic inode where
1190          * dinode have been zeroed out by iRead();
1191          */
1192
1193         /*
1194          * initialize log i/o
1195          */
1196         if ((rc = lbmLogInit(log)))
1197                 return rc;
1198
1199         /*
1200          * validate log superblock
1201          */
1202         if (!test_bit(log_INLINELOG, &log->flag))
1203                 log->l2bsize = 12;      /* XXX kludge alert XXX */
1204         if ((rc = lbmRead(log, 1, &bpsuper)))
1205                 goto errout10;
1206
1207         logsuper = (struct logsuper *) bpsuper->l_ldata;
1208
1209         if (logsuper->magic != cpu_to_le32(LOGMAGIC)) {
1210                 jfs_warn("*** Log Format Error ! ***");
1211                 rc = -EINVAL;
1212                 goto errout20;
1213         }
1214
1215         /* logredo() should have been run successfully. */
1216         if (logsuper->state != cpu_to_le32(LOGREDONE)) {
1217                 jfs_warn("*** Log Is Dirty ! ***");
1218                 rc = -EINVAL;
1219                 goto errout20;
1220         }
1221
1222         /* initialize log inode from log superblock */
1223         if (test_bit(log_INLINELOG,&log->flag)) {
1224                 if (log->size != le32_to_cpu(logsuper->size)) {
1225                         rc = -EINVAL;
1226                         goto errout20;
1227                 }
1228                 jfs_info("lmLogInit: inline log:0x%p base:0x%Lx size:0x%x",
1229                         log, (unsigned long long) log->base, log->size);
1230         } else {
1231                 if (memcmp(logsuper->uuid, log->uuid, 16)) {
1232                         jfs_warn("wrong uuid on JFS log device");
1233                         goto errout20;
1234                 }
1235                 log->size = le32_to_cpu(logsuper->size);
1236                 log->l2bsize = le32_to_cpu(logsuper->l2bsize);
1237                 jfs_info("lmLogInit: external log:0x%p base:0x%Lx size:0x%x",
1238                         log, (unsigned long long) log->base, log->size);
1239         }
1240
1241         log->page = le32_to_cpu(logsuper->end) / LOGPSIZE;
1242         log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page);
1243
1244         /* check for disabled journaling to disk */
1245         if (JFS_SBI(log->sb)->flag & JFS_NOINTEGRITY) {
1246                 log->no_integrity = 1;
1247                 log->ni_page = log->page;
1248                 log->ni_eor = log->eor;
1249         }
1250         else
1251                 log->no_integrity = 0;
1252
1253         /*
1254          * initialize for log append write mode
1255          */
1256         /* establish current/end-of-log page/buffer */
1257         if ((rc = lbmRead(log, log->page, &bp)))
1258                 goto errout20;
1259
1260         lp = (struct logpage *) bp->l_ldata;
1261
1262         jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d",
1263                  le32_to_cpu(logsuper->end), log->page, log->eor,
1264                  le16_to_cpu(lp->h.eor));
1265
1266 //      ASSERT(log->eor == lp->h.eor);
1267
1268         log->bp = bp;
1269         bp->l_pn = log->page;
1270         bp->l_eor = log->eor;
1271
1272         /* initialize the group commit serialization lock */
1273         LOGGC_LOCK_INIT(log);
1274
1275         /* if current page is full, move on to next page */
1276         if (log->eor >= LOGPSIZE - LOGPTLRSIZE)
1277                 lmNextPage(log);
1278
1279         /* allocate/initialize the log write serialization lock */
1280         LOG_LOCK_INIT(log);
1281
1282         /*
1283          * initialize log syncpoint
1284          */
1285         /*
1286          * write the first SYNCPT record with syncpoint = 0
1287          * (i.e., log redo up to HERE !);
1288          * remove current page from lbm write queue at end of pageout
1289          * (to write log superblock update), but do not release to freelist;
1290          */
1291         lrd.logtid = 0;
1292         lrd.backchain = 0;
1293         lrd.type = cpu_to_le16(LOG_SYNCPT);
1294         lrd.length = 0;
1295         lrd.log.syncpt.sync = 0;
1296         lsn = lmWriteRecord(log, NULL, &lrd, NULL);
1297         bp = log->bp;
1298         bp->l_ceor = bp->l_eor;
1299         lp = (struct logpage *) bp->l_ldata;
1300         lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
1301         lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0);
1302         if ((rc = lbmIOWait(bp, 0)))
1303                 goto errout30;
1304
1305         /* initialize logsync parameters */
1306         log->logsize = (log->size - 2) << L2LOGPSIZE;
1307         log->lsn = lsn;
1308         log->syncpt = lsn;
1309         log->sync = log->syncpt;
1310         log->nextsync = LOGSYNC_DELTA(log->logsize);
1311
1312         jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x",
1313                  log->lsn, log->syncpt, log->sync);
1314
1315         LOGSYNC_LOCK_INIT(log);
1316
1317         INIT_LIST_HEAD(&log->synclist);
1318
1319         log->cqueue.head = log->cqueue.tail = NULL;
1320         log->flush_tblk = NULL;
1321
1322         log->count = 0;
1323
1324         /*
1325          * initialize for lazy/group commit
1326          */
1327         log->clsn = lsn;
1328
1329         /*
1330          * update/write superblock
1331          */
1332         logsuper->state = cpu_to_le32(LOGMOUNT);
1333         log->serial = le32_to_cpu(logsuper->serial) + 1;
1334         logsuper->serial = cpu_to_le32(log->serial);
1335         lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
1336         if ((rc = lbmIOWait(bpsuper, lbmFREE)))
1337                 goto errout30;
1338
1339         return 0;
1340
1341         /*
1342          *      unwind on error
1343          */
1344       errout30:         /* release log page */
1345         lbmFree(bp);
1346
1347       errout20:         /* release log superblock */
1348         lbmFree(bpsuper);
1349
1350       errout10:         /* unwind lbmLogInit() */
1351         lbmLogShutdown(log);
1352
1353         jfs_warn("lmLogInit: exit(%d)", rc);
1354         return rc;
1355 }
1356
1357
1358 /*
1359  * NAME:        lmLogClose()
1360  *
1361  * FUNCTION:    remove file system <ipmnt> from active list of log <iplog>
1362  *              and close it on last close.
1363  *
1364  * PARAMETER:   sb      - superblock
1365  *              log     - log inode
1366  *
1367  * RETURN:      errors from subroutines
1368  *
1369  * serialization:
1370  */
1371 int lmLogClose(struct super_block *sb, struct jfs_log * log)
1372 {
1373         struct block_device *bdev = log->bdev;
1374         int rc;
1375
1376         jfs_info("lmLogClose: log:0x%p", log);
1377
1378         if (!test_bit(log_INLINELOG, &log->flag))
1379                 goto externalLog;
1380         
1381         /*
1382          *      in-line log in host file system
1383          */
1384         rc = lmLogShutdown(log);
1385         goto out;
1386
1387         /*
1388          *      external log as separate logical volume
1389          */
1390       externalLog:
1391         lmLogFileSystem(log, JFS_SBI(sb)->uuid, 0);
1392         rc = lmLogShutdown(log);
1393
1394         bd_release(bdev);
1395         blkdev_put(bdev, BDEV_FS);
1396
1397       out:
1398         jfs_info("lmLogClose: exit(%d)", rc);
1399         return rc;
1400 }
1401
1402
1403 /*
1404  * NAME:        jfs_flush_journal()
1405  *
1406  * FUNCTION:    initiate write of any outstanding transactions to the journal
1407  *              and optionally wait until they are all written to disk
1408  *
1409  *              wait == 0  flush until latest txn is committed, don't wait
1410  *              wait == 1  flush until latest txn is committed, wait
1411  *              wait > 1   flush until all txn's are complete, wait
1412  */
1413 void jfs_flush_journal(struct jfs_log *log, int wait)
1414 {
1415         int i;
1416         struct tblock *target;
1417
1418         /* jfs_write_inode may call us during read-only mount */
1419         if (!log)
1420                 return;
1421
1422         jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait);
1423
1424         LOGGC_LOCK(log);
1425
1426         target = log->cqueue.head;
1427
1428         if (target) {
1429                 /*
1430                  * This ensures that we will keep writing to the journal as long
1431                  * as there are unwritten commit records
1432                  */
1433
1434                 if (test_bit(log_FLUSH, &log->flag)) {
1435                         /*
1436                          * We're already flushing.
1437                          * if flush_tblk is NULL, we are flushing everything,
1438                          * so leave it that way.  Otherwise, update it to the
1439                          * latest transaction
1440                          */
1441                         if (log->flush_tblk)
1442                                 log->flush_tblk = target;
1443                 } else {
1444                         /* Only flush until latest transaction is committed */
1445                         log->flush_tblk = target;
1446                         set_bit(log_FLUSH, &log->flag);
1447
1448                         /*
1449                          * Initiate I/O on outstanding transactions
1450                          */
1451                         if (!(log->cflag & logGC_PAGEOUT)) {
1452                                 log->cflag |= logGC_PAGEOUT;
1453                                 lmGCwrite(log, 0);
1454                         }
1455                 }
1456         }
1457         if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) {
1458                 /* Flush until all activity complete */
1459                 set_bit(log_FLUSH, &log->flag);
1460                 log->flush_tblk = NULL;
1461         }
1462
1463         if (wait && target && !(target->flag & tblkGC_COMMITTED)) {
1464                 DECLARE_WAITQUEUE(__wait, current);
1465
1466                 add_wait_queue(&target->gcwait, &__wait);
1467                 set_current_state(TASK_UNINTERRUPTIBLE);
1468                 LOGGC_UNLOCK(log);
1469                 schedule();
1470                 current->state = TASK_RUNNING;
1471                 LOGGC_LOCK(log);
1472                 remove_wait_queue(&target->gcwait, &__wait);
1473         }
1474         LOGGC_UNLOCK(log);
1475
1476         if (wait < 2)
1477                 return;
1478
1479         /*
1480          * If there was recent activity, we may need to wait
1481          * for the lazycommit thread to catch up
1482          */
1483         if (log->cqueue.head || !list_empty(&log->synclist)) {
1484                 for (i = 0; i < 800; i++) {     /* Too much? */
1485                         current->state = TASK_INTERRUPTIBLE;
1486                         schedule_timeout(HZ / 4);
1487                         if ((log->cqueue.head == NULL) &&
1488                             list_empty(&log->synclist))
1489                                 break;
1490                 }
1491         }
1492         assert(log->cqueue.head == NULL);
1493         assert(list_empty(&log->synclist));
1494         clear_bit(log_FLUSH, &log->flag);
1495 }
1496
1497 /*
1498  * NAME:        lmLogShutdown()
1499  *
1500  * FUNCTION:    log shutdown at last LogClose().
1501  *
1502  *              write log syncpt record.
1503  *              update super block to set redone flag to 0.
1504  *
1505  * PARAMETER:   log     - log inode
1506  *
1507  * RETURN:      0       - success
1508  *                      
1509  * serialization: single last close thread
1510  */
1511 int lmLogShutdown(struct jfs_log * log)
1512 {
1513         int rc;
1514         struct lrd lrd;
1515         int lsn;
1516         struct logsuper *logsuper;
1517         struct lbuf *bpsuper;
1518         struct lbuf *bp;
1519         struct logpage *lp;
1520
1521         jfs_info("lmLogShutdown: log:0x%p", log);
1522
1523         jfs_flush_journal(log, 2);
1524
1525         /*
1526          * We need to make sure all of the "written" metapages
1527          * actually make it to disk
1528          */
1529         sync_blockdev(log->sb->s_bdev);
1530
1531         /*
1532          * write the last SYNCPT record with syncpoint = 0
1533          * (i.e., log redo up to HERE !)
1534          */
1535         lrd.logtid = 0;
1536         lrd.backchain = 0;
1537         lrd.type = cpu_to_le16(LOG_SYNCPT);
1538         lrd.length = 0;
1539         lrd.log.syncpt.sync = 0;
1540         
1541         /* check for disabled journaling to disk */
1542         if (JFS_SBI(log->sb)->flag & JFS_NOINTEGRITY) {
1543                 log->no_integrity = 0;
1544                 log->page = log->ni_page;
1545                 log->eor = log->ni_eor;
1546         }
1547
1548         lsn = lmWriteRecord(log, NULL, &lrd, NULL);
1549         bp = log->bp;
1550         lp = (struct logpage *) bp->l_ldata;
1551         lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor);
1552         lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0);
1553         lbmIOWait(log->bp, lbmFREE);
1554
1555         /*
1556          * synchronous update log superblock
1557          * mark log state as shutdown cleanly
1558          * (i.e., Log does not need to be replayed).
1559          */
1560         if ((rc = lbmRead(log, 1, &bpsuper)))
1561                 goto out;
1562
1563         logsuper = (struct logsuper *) bpsuper->l_ldata;
1564         logsuper->state = cpu_to_le32(LOGREDONE);
1565         logsuper->end = cpu_to_le32(lsn);
1566         lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
1567         rc = lbmIOWait(bpsuper, lbmFREE);
1568
1569         jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d",
1570                  lsn, log->page, log->eor);
1571
1572       out:    
1573         /*
1574          * shutdown per log i/o
1575          */
1576         lbmLogShutdown(log);
1577
1578         if (rc) {
1579                 jfs_warn("lmLogShutdown: exit(%d)", rc);
1580         }
1581         return rc;
1582 }
1583
1584
1585 /*
1586  * NAME:        lmLogFileSystem()
1587  *
1588  * FUNCTION:    insert (<activate> = true)/remove (<activate> = false)
1589  *      file system into/from log active file system list.
1590  *
1591  * PARAMETE:    log     - pointer to logs inode.
1592  *              fsdev   - kdev_t of filesystem.
1593  *              serial  - pointer to returned log serial number
1594  *              activate - insert/remove device from active list.
1595  *
1596  * RETURN:      0       - success
1597  *              errors returned by vms_iowait().
1598  */
1599 static int lmLogFileSystem(struct jfs_log * log, char *uuid, int activate)
1600 {
1601         int rc = 0;
1602         int i;
1603         struct logsuper *logsuper;
1604         struct lbuf *bpsuper;
1605
1606         /*
1607          * insert/remove file system device to log active file system list.
1608          */
1609         if ((rc = lbmRead(log, 1, &bpsuper)))
1610                 return rc;
1611
1612         logsuper = (struct logsuper *) bpsuper->l_ldata;
1613         if (activate) {
1614                 for (i = 0; i < MAX_ACTIVE; i++)
1615                         if (!memcmp(logsuper->active[i].uuid, NULL_UUID, 16)) {
1616                                 memcpy(logsuper->active[i].uuid, uuid, 16);
1617                                 break;
1618                         }
1619                 if (i == MAX_ACTIVE) {
1620                         jfs_warn("Too many file systems sharing journal!");
1621                         lbmFree(bpsuper);
1622                         return -EMFILE; /* Is there a better rc? */
1623                 }
1624         } else {
1625                 for (i = 0; i < MAX_ACTIVE; i++)
1626                         if (!memcmp(logsuper->active[i].uuid, uuid, 16)) {
1627                                 memcpy(logsuper->active[i].uuid, NULL_UUID, 16);
1628                                 break;
1629                         }
1630                 if (i == MAX_ACTIVE) {
1631                         jfs_warn("Somebody stomped on the journal!");
1632                         lbmFree(bpsuper);
1633                         return -EIO;
1634                 }
1635                 
1636         }
1637
1638         /*
1639          * synchronous write log superblock:
1640          *
1641          * write sidestream bypassing write queue:
1642          * at file system mount, log super block is updated for
1643          * activation of the file system before any log record
1644          * (MOUNT record) of the file system, and at file system
1645          * unmount, all meta data for the file system has been
1646          * flushed before log super block is updated for deactivation
1647          * of the file system.
1648          */
1649         lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC);
1650         rc = lbmIOWait(bpsuper, lbmFREE);
1651
1652         return rc;
1653 }
1654
1655 /*
1656  *              log buffer manager (lbm)
1657  *              ------------------------
1658  *
1659  * special purpose buffer manager supporting log i/o requirements.
1660  *
1661  * per log write queue:
1662  * log pageout occurs in serial order by fifo write queue and
1663  * restricting to a single i/o in pregress at any one time.
1664  * a circular singly-linked list
1665  * (log->wrqueue points to the tail, and buffers are linked via
1666  * bp->wrqueue field), and
1667  * maintains log page in pageout ot waiting for pageout in serial pageout.
1668  */
1669
1670 /*
1671  *      lbmLogInit()
1672  *
1673  * initialize per log I/O setup at lmLogInit()
1674  */
1675 static int lbmLogInit(struct jfs_log * log)
1676 {                               /* log inode */
1677         int i;
1678         struct lbuf *lbuf;
1679
1680         jfs_info("lbmLogInit: log:0x%p", log);
1681
1682         /* initialize current buffer cursor */
1683         log->bp = NULL;
1684
1685         /* initialize log device write queue */
1686         log->wqueue = NULL;
1687
1688         /*
1689          * Each log has its own buffer pages allocated to it.  These are
1690          * not managed by the page cache.  This ensures that a transaction
1691          * writing to the log does not block trying to allocate a page from
1692          * the page cache (for the log).  This would be bad, since page
1693          * allocation waits on the kswapd thread that may be committing inodes
1694          * which would cause log activity.  Was that clear?  I'm trying to
1695          * avoid deadlock here.
1696          */
1697         init_waitqueue_head(&log->free_wait);
1698
1699         log->lbuf_free = NULL;
1700
1701         for (i = 0; i < LOGPAGES; i++) {
1702                 lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL);
1703                 if (lbuf == 0)
1704                         goto error;
1705                 lbuf->l_ldata = (char *) __get_free_page(GFP_KERNEL);
1706                 if (lbuf->l_ldata == 0) {
1707                         kfree(lbuf);
1708                         goto error;
1709                 }
1710                 lbuf->l_log = log;
1711                 init_waitqueue_head(&lbuf->l_ioevent);
1712
1713                 lbuf->l_freelist = log->lbuf_free;
1714                 log->lbuf_free = lbuf;
1715         }
1716
1717         return (0);
1718
1719       error:
1720         lbmLogShutdown(log);
1721         return -ENOMEM;
1722 }
1723
1724
1725 /*
1726  *      lbmLogShutdown()
1727  *
1728  * finalize per log I/O setup at lmLogShutdown()
1729  */
1730 static void lbmLogShutdown(struct jfs_log * log)
1731 {
1732         struct lbuf *lbuf;
1733
1734         jfs_info("lbmLogShutdown: log:0x%p", log);
1735
1736         lbuf = log->lbuf_free;
1737         while (lbuf) {
1738                 struct lbuf *next = lbuf->l_freelist;
1739                 free_page((unsigned long) lbuf->l_ldata);
1740                 kfree(lbuf);
1741                 lbuf = next;
1742         }
1743
1744         log->bp = NULL;
1745 }
1746
1747
1748 /*
1749  *      lbmAllocate()
1750  *
1751  * allocate an empty log buffer
1752  */
1753 static struct lbuf *lbmAllocate(struct jfs_log * log, int pn)
1754 {
1755         struct lbuf *bp;
1756         unsigned long flags;
1757
1758         /*
1759          * recycle from log buffer freelist if any
1760          */
1761         LCACHE_LOCK(flags);
1762         LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags);
1763         log->lbuf_free = bp->l_freelist;
1764         LCACHE_UNLOCK(flags);
1765
1766         bp->l_flag = 0;
1767
1768         bp->l_wqnext = NULL;
1769         bp->l_freelist = NULL;
1770
1771         bp->l_pn = pn;
1772         bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize));
1773         bp->l_ceor = 0;
1774
1775         return bp;
1776 }
1777
1778
1779 /*
1780  *      lbmFree()
1781  *
1782  * release a log buffer to freelist
1783  */
static void lbmFree(struct lbuf * bp)
{
	unsigned long flags;

	/* locked wrapper around lbmfree() for callers that do not
	 * already hold LCACHE_LOCK
	 */
	LCACHE_LOCK(flags);
	lbmfree(bp);
	LCACHE_UNLOCK(flags);
}
1794
1795 static void lbmfree(struct lbuf * bp)
1796 {
1797         struct jfs_log *log = bp->l_log;
1798
1799         assert(bp->l_wqnext == NULL);
1800
1801         /*
1802          * return the buffer to head of freelist
1803          */
1804         bp->l_freelist = log->lbuf_free;
1805         log->lbuf_free = bp;
1806
1807         wake_up(&log->free_wait);
1808         return;
1809 }
1810
1811
1812 /*
1813  * NAME:        lbmRedrive
1814  *
 * FUNCTION:    add a log buffer to the log redrive list
1816  *
1817  * PARAMETER:
1818  *     bp       - log buffer
1819  *
1820  * NOTES:
1821  *      Takes log_redrive_lock.
1822  */
static inline void lbmRedrive(struct lbuf *bp)
{
	unsigned long flags;

	/* push bp onto the global redrive list; irqsave lock makes this
	 * safe to call from interrupt context (see lbmIODone)
	 */
	spin_lock_irqsave(&log_redrive_lock, flags);
	bp->l_redrive_next = log_redrive_list;
	log_redrive_list = bp;
	spin_unlock_irqrestore(&log_redrive_lock, flags);

	/* wake the jfsIO kernel thread (jfsIOWait) to submit the i/o */
	wake_up(&jfs_IO_thread_wait);
}
1834
1835
1836 /*
1837  *      lbmRead()
1838  */
static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
{
	struct bio *bio;
	struct lbuf *bp;

	/*
	 * allocate a log buffer
	 */
	*bpp = bp = lbmAllocate(log, pn);
	jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn);

	bp->l_flag |= lbmREAD;

	/* build a single-segment bio covering the whole log page;
	 * bi_sector converts the fs-block address to 512-byte sectors
	 */
	bio = bio_alloc(GFP_NOFS, 1);

	bio->bi_sector = bp->l_blkno << (log->l2bsize - 9);
	bio->bi_bdev = log->bdev;
	bio->bi_io_vec[0].bv_page = virt_to_page(bp->l_ldata);
	bio->bi_io_vec[0].bv_len = LOGPSIZE;
	bio->bi_io_vec[0].bv_offset = 0;

	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = LOGPSIZE;

	bio->bi_end_io = lbmIODone;
	bio->bi_private = bp;
	submit_bio(READ, bio);
	blk_run_queues();

	/* lbmIODone() clears lbmREAD (and sets lbmDONE) on pagein
	 * completion, so this wait ends once the read has finished
	 */
	wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));

	/* NOTE(review): a read error sets lbmERROR in bp->l_flag but is
	 * not reflected here -- callers always see 0; verify callers
	 * check bp->l_flag if they care about i/o errors
	 */
	return 0;
}
1873
1874
1875 /*
1876  *      lbmWrite()
1877  *
1878  * buffer at head of pageout queue stays after completion of
1879  * partial-page pageout and redriven by explicit initiation of
1880  * pageout by caller until full-page pageout is completed and
1881  * released.
1882  *
1883  * device driver i/o done redrives pageout of new buffer at
1884  * head of pageout queue when current buffer at head of pageout
1885  * queue is released at the completion of its full-page pageout.
1886  *
1887  * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit().
1888  * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone()
1889  */
static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag,
		     int cant_block)
{
	struct lbuf *tail;
	unsigned long flags;

	jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn);

	/* map the logical block address to physical block address */
	bp->l_blkno =
	    log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));

	LCACHE_LOCK(flags);		/* disable+lock */

	/*
	 * initialize buffer for device driver
	 */
	bp->l_flag = flag;

	/*
	 *	insert bp at tail of write queue associated with log
	 *
	 * (request is either for bp already/currently at head of queue
	 * or new bp to be inserted at tail)
	 *
	 * the queue is a circular singly-linked list: log->wqueue points
	 * at the tail, tail->l_wqnext at the head.
	 */
	tail = log->wqueue;

	/* is buffer not already on write queue ? */
	if (bp->l_wqnext == NULL) {
		/* insert at tail of wqueue */
		if (tail == NULL) {
			/* empty queue: bp becomes a circular list of one */
			log->wqueue = bp;
			bp->l_wqnext = bp;
		} else {
			log->wqueue = bp;
			bp->l_wqnext = tail->l_wqnext;
			tail->l_wqnext = bp;
		}

		tail = bp;
	}

	/* is buffer at head of wqueue and for write ? */
	if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) {
		/* not at head (or not a write request): the i/o will be
		 * driven later by lbmIODone() when the current head
		 * completes its pageout
		 */
		LCACHE_UNLOCK(flags);	/* unlock+enable */
		return;
	}

	LCACHE_UNLOCK(flags);	/* unlock+enable */

	if (cant_block)
		/* caller cannot block: defer the (possibly blocking)
		 * submit to the jfsIO thread via the redrive list
		 */
		lbmRedrive(bp);
	else if (flag & lbmSYNC)
		lbmStartIO(bp);
	else {
		/* drop LOGGC_LOCK around the blocking submit; see the
		 * serialization comment in the function header above
		 */
		LOGGC_UNLOCK(log);
		lbmStartIO(bp);
		LOGGC_LOCK(log);
	}
}
1950
1951
1952 /*
1953  *      lbmDirectWrite()
1954  *
1955  * initiate pageout bypassing write queue for sidestream
1956  * (e.g., log superblock) write;
1957  */
1958 static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag)
1959 {
1960         jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x",
1961                  bp, flag, bp->l_pn);
1962
1963         /*
1964          * initialize buffer for device driver
1965          */
1966         bp->l_flag = flag | lbmDIRECT;
1967
1968         /* map the logical block address to physical block address */
1969         bp->l_blkno =
1970             log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize));
1971
1972         /*
1973          *      initiate pageout of the page
1974          */
1975         lbmStartIO(bp);
1976 }
1977
1978
1979 /*
1980  * NAME:        lbmStartIO()
1981  *
1982  * FUNCTION:    Interface to DD strategy routine
1983  *
1984  * RETURN:      none
1985  *
1986  * serialization: LCACHE_LOCK() is NOT held during log i/o;
1987  */
static void lbmStartIO(struct lbuf * bp)
{
	struct bio *bio;
	struct jfs_log *log = bp->l_log;

	jfs_info("lbmStartIO\n");

	/* build a single-segment bio covering the whole log page */
	bio = bio_alloc(GFP_NOFS, 1);
	/* fs-block address -> 512-byte sector address */
	bio->bi_sector = bp->l_blkno << (log->l2bsize - 9);
	bio->bi_bdev = log->bdev;
	bio->bi_io_vec[0].bv_page = virt_to_page(bp->l_ldata);
	bio->bi_io_vec[0].bv_len = LOGPSIZE;
	bio->bi_io_vec[0].bv_offset = 0;

	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = LOGPSIZE;

	bio->bi_end_io = lbmIODone;
	bio->bi_private = bp;

	/* check if journaling to disk has been disabled */
	if (!log->no_integrity) {
		submit_bio(WRITE, bio);
		INCREMENT(lmStat.submitted);
		blk_run_queues();
	}
	else {
		/* journaling disabled: fake an immediate completion.
		 * bi_size must be zeroed first, or lbmIODone() would
		 * treat the call as a partial completion and bail out.
		 */
		bio->bi_size = 0;
		lbmIODone(bio, 0, 0); /* 2nd argument appears to not be used => 0
				       *  3rd argument appears to not be used => 0
				       */
	}
}
2022
2023
2024 /*
2025  *      lbmIOWait()
2026  */
2027 static int lbmIOWait(struct lbuf * bp, int flag)
2028 {
2029         unsigned long flags;
2030         int rc = 0;
2031
2032         jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);
2033
2034         LCACHE_LOCK(flags);             /* disable+lock */
2035
2036         LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags);
2037
2038         rc = (bp->l_flag & lbmERROR) ? -EIO : 0;
2039
2040         if (flag & lbmFREE)
2041                 lbmfree(bp);
2042
2043         LCACHE_UNLOCK(flags);   /* unlock+enable */
2044
2045         jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag);
2046         return rc;
2047 }
2048
2049 /*
2050  *      lbmIODone()
2051  *
2052  * executed at INTIODONE level
2053  */
static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
{
	struct lbuf *bp = bio->bi_private;
	struct lbuf *nextbp, *tail;
	struct jfs_log *log;
	unsigned long flags;

	/* partial completion: more of the bio is still outstanding,
	 * tell the block layer to keep it alive
	 */
	if (bio->bi_size)
		return 1;

	/*
	 * get back jfs buffer bound to the i/o buffer
	 */
	jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag);

	LCACHE_LOCK(flags);		/* disable+lock */

	bp->l_flag |= lbmDONE;

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
		bp->l_flag |= lbmERROR;

		jfs_err("lbmIODone: I/O error in JFS log");
	}

	/* the bio is finished with; all completion state lives in bp */
	bio_put(bio);

	/*
	 *	pagein completion
	 */
	if (bp->l_flag & lbmREAD) {
		/* clearing lbmREAD releases the lbmRead() waiter */
		bp->l_flag &= ~lbmREAD;

		LCACHE_UNLOCK(flags);	/* unlock+enable */

		/* wakeup I/O initiator */
		LCACHE_WAKEUP(&bp->l_ioevent);

		return 0;
	}

	/*
	 *	pageout completion
	 *
	 * the bp at the head of write queue has completed pageout.
	 *
	 * if single-commit/full-page pageout, remove the current buffer
	 * from head of pageout queue, and redrive pageout with
	 * the new buffer at head of pageout queue;
	 * otherwise, the partial-page pageout buffer stays at
	 * the head of pageout queue to be redriven for pageout
	 * by lmGroupCommit() until full-page pageout is completed.
	 */
	bp->l_flag &= ~lbmWRITE;
	INCREMENT(lmStat.pagedone);

	/* update committed lsn */
	log = bp->l_log;
	log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor;

	/* direct (sidestream) writes bypass the write queue entirely:
	 * just wake the initiator and return
	 */
	if (bp->l_flag & lbmDIRECT) {
		LCACHE_WAKEUP(&bp->l_ioevent);
		LCACHE_UNLOCK(flags);
		return 0;
	}

	tail = log->wqueue;

	/* single element queue */
	if (bp == tail) {
		/* remove head buffer of full-page pageout
		 * from log device write queue
		 */
		if (bp->l_flag & lbmRELEASE) {
			log->wqueue = NULL;
			bp->l_wqnext = NULL;
		}
	}
	/* multi element queue */
	else {
		/* remove head buffer of full-page pageout
		 * from log device write queue
		 */
		if (bp->l_flag & lbmRELEASE) {
			nextbp = tail->l_wqnext = bp->l_wqnext;
			bp->l_wqnext = NULL;

			/*
			 * redrive pageout of next page at head of write queue:
			 * redrive next page without any bound tblk
			 * (i.e., page w/o any COMMIT records), or
			 * first page of new group commit which has been
			 * queued after current page (subsequent pageout
			 * is performed synchronously, except page without
			 * any COMMITs) by lmGroupCommit() as indicated
			 * by lbmWRITE flag;
			 */
			if (nextbp->l_flag & lbmWRITE) {
				/*
				 * We can't do the I/O at interrupt time.
				 * The jfsIO thread can do it
				 */
				lbmRedrive(nextbp);
			}
		}
	}

	/*
	 *	synchronous pageout:
	 *
	 * buffer has not necessarily been removed from write queue
	 * (e.g., synchronous write of partial-page with COMMIT):
	 * leave buffer for i/o initiator to dispose
	 */
	if (bp->l_flag & lbmSYNC) {
		LCACHE_UNLOCK(flags);	/* unlock+enable */

		/* wakeup I/O initiator */
		LCACHE_WAKEUP(&bp->l_ioevent);
	}

	/*
	 *	Group Commit pageout:
	 */
	else if (bp->l_flag & lbmGC) {
		LCACHE_UNLOCK(flags);
		lmPostGC(bp);
	}

	/*
	 *	asynchronous pageout:
	 *
	 * buffer must have been removed from write queue:
	 * insert buffer at head of freelist where it can be recycled
	 */
	else {
		assert(bp->l_flag & lbmRELEASE);
		assert(bp->l_flag & lbmFREE);
		lbmfree(bp);

		LCACHE_UNLOCK(flags);	/* unlock+enable */
	}

	return 0;
}
2199
/*
 * jfsIOWait()
 *
 * jfsIO kernel thread: drains the global log_redrive_list, submitting
 * i/o for each buffer queued by lbmRedrive() (work deferred from
 * interrupt context), then sleeps until woken; exits when
 * jfs_stop_threads is set.
 */
int jfsIOWait(void *arg)
{
	struct lbuf *bp;

	daemonize("jfsIO");

	/* let the thread's creator know we are up and running */
	complete(&jfsIOwait);

	do {
		DECLARE_WAITQUEUE(wq, current);

		spin_lock_irq(&log_redrive_lock);
		while ((bp = log_redrive_list)) {
			/* pop one buffer, then drop the lock around the
			 * (potentially blocking) i/o submission
			 */
			log_redrive_list = bp->l_redrive_next;
			bp->l_redrive_next = NULL;
			spin_unlock_irq(&log_redrive_lock);
			lbmStartIO(bp);
			spin_lock_irq(&log_redrive_lock);
		}
		if (current->flags & PF_FREEZE) {
			/* cooperate with system suspend */
			spin_unlock_irq(&log_redrive_lock);
			refrigerator(PF_IOTHREAD);
		} else {
			/* enqueue on the wait queue and change state
			 * before releasing the lock, so a concurrent
			 * lbmRedrive() wakeup cannot be missed
			 */
			add_wait_queue(&jfs_IO_thread_wait, &wq);
			set_current_state(TASK_INTERRUPTIBLE);
			spin_unlock_irq(&log_redrive_lock);
			schedule();
			current->state = TASK_RUNNING;
			remove_wait_queue(&jfs_IO_thread_wait, &wq);
		}
	} while (!jfs_stop_threads);

	jfs_info("jfsIOWait being killed!");
	complete_and_exit(&jfsIOwait, 0);
}
2235
2236 /*
2237  * NAME:        lmLogFormat()/jfs_logform()
2238  *
2239  * FUNCTION:    format file system log
2240  *
2241  * PARAMETERS:
2242  *      log     - volume log
2243  *      logAddress - start address of log space in FS block
2244  *      logSize - length of log space in FS block;
2245  *
2246  * RETURN:      0       - success
2247  *              -EIO    - i/o error
2248  *
2249  * XXX: We're synchronously writing one page at a time.  This needs to
2250  *      be improved by writing multiple pages at once.
2251  */
int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize)
{
	int rc = -EIO;
	struct jfs_sb_info *sbi = JFS_SBI(log->sb);
	struct logsuper *logsuper;
	struct logpage *lp;
	int lspn;		/* log sequence page number */
	struct lrd *lrd_ptr;
	int npages = 0;
	struct lbuf *bp;

	jfs_info("lmLogFormat: logAddress:%Ld logSize:%d",
		 (long long)logAddress, logSize);

	/* allocate a log buffer */
	/* NOTE(review): lbmAllocate() sleeps until a buffer is free and
	 * never returns NULL, so no error check is needed here
	 */
	bp = lbmAllocate(log, 1);

	npages = logSize >> sbi->l2nbperpage;

	/*
	 *	log space:
	 *
	 * page 0 - reserved;
	 * page 1 - log superblock;
	 * page 2 - log data page: A SYNC log record is written
	 *          into this page at logform time;
	 * pages 3-N - log data page: set to empty log data pages;
	 */
	/*
	 *	init log superblock: log page 1
	 */
	logsuper = (struct logsuper *) bp->l_ldata;

	/* all on-disk fields are stored little-endian */
	logsuper->magic = cpu_to_le32(LOGMAGIC);
	logsuper->version = cpu_to_le32(LOGVERSION);
	logsuper->state = cpu_to_le32(LOGREDONE);
	logsuper->flag = cpu_to_le32(sbi->mntflag);	/* ? */
	logsuper->size = cpu_to_le32(npages);
	logsuper->bsize = cpu_to_le32(sbi->bsize);
	logsuper->l2bsize = cpu_to_le32(sbi->l2bsize);
	logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE);

	/* write the superblock synchronously, bypassing the write queue */
	bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
	bp->l_blkno = logAddress + sbi->nbperpage;
	lbmStartIO(bp);
	if ((rc = lbmIOWait(bp, 0)))
		goto exit;

	/*
	 *	init pages 2 to npages-1 as log data pages:
	 *
	 * log page sequence number (lpsn) initialization:
	 *
	 * pn:   0     1     2     3                 n-1
	 *       +-----+-----+=====+=====+===.....===+=====+
	 * lspn:             N-1   0     1           N-2
	 *                   <--- N page circular file ---->
	 *
	 * the N (= npages-2) data pages of the log is maintained as
	 * a circular file for the log records;
	 * lpsn grows by 1 monotonically as each log page is written
	 * to the circular file of the log;
	 * and setLogpage() will not reset the page number even if
	 * the eor is equal to LOGPHDRSIZE. In order for binary search
	 * still work in find log end process, we have to simulate the
	 * log wrap situation at the log format time.
	 * The 1st log page written will have the highest lpsn. Then
	 * the succeeding log pages will have ascending order of
	 * the lspn starting from 0, ... (N-2)
	 */
	lp = (struct logpage *) bp->l_ldata;
	/*
	 * initialize 1st log page to be written: lpsn = N - 1,
	 * write a SYNCPT log record is written to this page
	 */
	lp->h.page = lp->t.page = cpu_to_le32(npages - 3);
	lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE);

	/* a single SYNCPT record is the only content of this page */
	lrd_ptr = (struct lrd *) &lp->data;
	lrd_ptr->logtid = 0;
	lrd_ptr->backchain = 0;
	lrd_ptr->type = cpu_to_le16(LOG_SYNCPT);
	lrd_ptr->length = 0;
	lrd_ptr->log.syncpt.sync = 0;

	bp->l_blkno += sbi->nbperpage;
	bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
	lbmStartIO(bp);
	if ((rc = lbmIOWait(bp, 0)))
		goto exit;

	/*
	 *	initialize succeeding log pages: lpsn = 0, 1, ..., (N-2)
	 *
	 * each page is written synchronously, one at a time (see the
	 * XXX in the function header about batching)
	 */
	for (lspn = 0; lspn < npages - 3; lspn++) {
		lp->h.page = lp->t.page = cpu_to_le32(lspn);
		lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE);

		bp->l_blkno += sbi->nbperpage;
		bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT;
		lbmStartIO(bp);
		if ((rc = lbmIOWait(bp, 0)))
			goto exit;
	}

	rc = 0;
exit:
	/*
	 *	finalize log
	 */
	/* release the buffer */
	lbmFree(bp);

	return rc;
}
2367
2368 #ifdef CONFIG_JFS_STATISTICS
2369 int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
2370                       int *eof, void *data)
2371 {
2372         int len = 0;
2373         off_t begin;
2374
2375         len += sprintf(buffer,
2376                        "JFS Logmgr stats\n"
2377                        "================\n"
2378                        "commits = %d\n"
2379                        "writes submitted = %d\n"
2380                        "writes completed = %d\n"
2381                        "full pages submitted = %d\n"
2382                        "partial pages submitted = %d\n",
2383                        lmStat.commit,
2384                        lmStat.submitted,
2385                        lmStat.pagedone,
2386                        lmStat.full_page,
2387                        lmStat.partial_page);
2388
2389         begin = offset;
2390         *start = buffer + begin;
2391         len -= begin;
2392
2393         if (len > length)
2394                 len = length;
2395         else
2396                 *eof = 1;
2397
2398         if (len < 0)
2399                 len = 0;
2400
2401         return len;
2402 }
2403 #endif /* CONFIG_JFS_STATISTICS */