6591cb21edf6597fbd04d3bbd31695745dfc1b38
[linux-flexiantxendom0-natty.git] / fs / reiserfs / fix_node.c
1 /*
2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3  */
4
5 /**
6  ** old_item_num
7  ** old_entry_num
8  ** set_entry_sizes
9  ** create_virtual_node
10  ** check_left
11  ** check_right
12  ** directory_part_size
13  ** get_num_ver
14  ** set_parameters
15  ** is_leaf_removable
16  ** are_leaves_removable
17  ** get_empty_nodes
18  ** get_lfree
19  ** get_rfree
20  ** is_left_neighbor_in_cache
21  ** decrement_key
22  ** get_far_parent
23  ** get_parents
24  ** can_node_be_removed
25  ** ip_check_balance
26  ** dc_check_balance_internal
27  ** dc_check_balance_leaf
28  ** dc_check_balance
29  ** check_balance
30  ** get_direct_parent
31  ** get_neighbors
32  ** fix_nodes
33  **
34  **
35  **/
36
37 #include <linux/time.h>
38 #include <linux/string.h>
39 #include <linux/reiserfs_fs.h>
40 #include <linux/buffer_head.h>
41
42 /* To make any changes in the tree we find a node, that contains item
43    to be changed/deleted or position in the node we insert a new item
44    to. We call this node S. To do balancing we need to decide what we
45    will shift to left/right neighbor, or to a new node, where new item
46    will be etc. To make this analysis simpler we build virtual
47    node. Virtual node is an array of items, that will replace items of
48    node S. (For instance if we are going to delete an item, virtual
49    node does not contain it). Virtual node keeps information about
50    item sizes and types, mergeability of first and last items, sizes
51    of all entries in directory item. We use this array of items when
52    calculating what we can shift to neighbors and how many nodes we
53    have to have if we do not any shiftings, if we shift to left/right
54    neighbor or to both. */
55
56 /* taking item number in virtual node, returns number of item, that it has in source buffer */
57 static inline int old_item_num(int new_num, int affected_item_num, int mode)
58 {
59         if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
60                 return new_num;
61
62         if (mode == M_INSERT) {
63
64                 RFALSE(new_num == 0,
65                        "vs-8005: for INSERT mode and item number of inserted item");
66
67                 return new_num - 1;
68         }
69
70         RFALSE(mode != M_DELETE,
71                "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
72                mode);
73         /* delete mode */
74         return new_num + 1;
75 }
76
77 static void create_virtual_node(struct tree_balance *tb, int h)
78 {
79         struct item_head *ih;
80         struct virtual_node *vn = tb->tb_vn;
81         int new_num;
82         struct buffer_head *Sh; /* this comes from tb->S[h] */
83
84         Sh = PATH_H_PBUFFER(tb->tb_path, h);
85
86         /* size of changed node */
87         vn->vn_size =
88             MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
89
90         /* for internal nodes array if virtual items is not created */
91         if (h) {
92                 vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
93                 return;
94         }
95
96         /* number of items in virtual node  */
97         vn->vn_nr_item =
98             B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
99             ((vn->vn_mode == M_DELETE) ? 1 : 0);
100
101         /* first virtual item */
102         vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
103         memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
104         vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
105
106         /* first item in the node */
107         ih = B_N_PITEM_HEAD(Sh, 0);
108
109         /* define the mergeability for 0-th item (if it is not being deleted) */
110         if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size)
111             && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
112                 vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
113
114         /* go through all items those remain in the virtual node (except for the new (inserted) one) */
115         for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
116                 int j;
117                 struct virtual_item *vi = vn->vn_vi + new_num;
118                 int is_affected =
119                     ((new_num != vn->vn_affected_item_num) ? 0 : 1);
120
121                 if (is_affected && vn->vn_mode == M_INSERT)
122                         continue;
123
124                 /* get item number in source node */
125                 j = old_item_num(new_num, vn->vn_affected_item_num,
126                                  vn->vn_mode);
127
128                 vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
129                 vi->vi_ih = ih + j;
130                 vi->vi_item = B_I_PITEM(Sh, ih + j);
131                 vi->vi_uarea = vn->vn_free_ptr;
132
133                 // FIXME: there is no check, that item operation did not
134                 // consume too much memory
135                 vn->vn_free_ptr +=
136                     op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
137                 if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
138                         reiserfs_panic(tb->tb_sb, "vs-8030",
139                                        "virtual node space consumed");
140
141                 if (!is_affected)
142                         /* this is not being changed */
143                         continue;
144
145                 if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
146                         vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
147                         vi->vi_new_data = vn->vn_data;  // pointer to data which is going to be pasted
148                 }
149         }
150
151         /* virtual inserted item is not defined yet */
152         if (vn->vn_mode == M_INSERT) {
153                 struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
154
155                 RFALSE(vn->vn_ins_ih == NULL,
156                        "vs-8040: item header of inserted item is not specified");
157                 vi->vi_item_len = tb->insert_size[0];
158                 vi->vi_ih = vn->vn_ins_ih;
159                 vi->vi_item = vn->vn_data;
160                 vi->vi_uarea = vn->vn_free_ptr;
161
162                 op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
163                              tb->insert_size[0]);
164         }
165
166         /* set right merge flag we take right delimiting key and check whether it is a mergeable item */
167         if (tb->CFR[0]) {
168                 struct reiserfs_key *key;
169
170                 key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]);
171                 if (op_is_left_mergeable(key, Sh->b_size)
172                     && (vn->vn_mode != M_DELETE
173                         || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
174                         vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
175                             VI_TYPE_RIGHT_MERGEABLE;
176
177 #ifdef CONFIG_REISERFS_CHECK
178                 if (op_is_left_mergeable(key, Sh->b_size) &&
179                     !(vn->vn_mode != M_DELETE
180                       || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
181                         /* we delete last item and it could be merged with right neighbor's first item */
182                         if (!
183                             (B_NR_ITEMS(Sh) == 1
184                              && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0))
185                              && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) {
186                                 /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
187                                 print_block(Sh, 0, -1, -1);
188                                 reiserfs_panic(tb->tb_sb, "vs-8045",
189                                                "rdkey %k, affected item==%d "
190                                                "(mode==%c) Must be %c",
191                                                key, vn->vn_affected_item_num,
192                                                vn->vn_mode, M_DELETE);
193                         }
194                 }
195 #endif
196
197         }
198 }
199
200 /* using virtual node check, how many items can be shifted to left
201    neighbor */
202 static void check_left(struct tree_balance *tb, int h, int cur_free)
203 {
204         int i;
205         struct virtual_node *vn = tb->tb_vn;
206         struct virtual_item *vi;
207         int d_size, ih_size;
208
209         RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
210
211         /* internal level */
212         if (h > 0) {
213                 tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
214                 return;
215         }
216
217         /* leaf level */
218
219         if (!cur_free || !vn->vn_nr_item) {
220                 /* no free space or nothing to move */
221                 tb->lnum[h] = 0;
222                 tb->lbytes = -1;
223                 return;
224         }
225
226         RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
227                "vs-8055: parent does not exist or invalid");
228
229         vi = vn->vn_vi;
230         if ((unsigned int)cur_free >=
231             (vn->vn_size -
232              ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
233                 /* all contents of S[0] fits into L[0] */
234
235                 RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
236                        "vs-8055: invalid mode or balance condition failed");
237
238                 tb->lnum[0] = vn->vn_nr_item;
239                 tb->lbytes = -1;
240                 return;
241         }
242
243         d_size = 0, ih_size = IH_SIZE;
244
245         /* first item may be merge with last item in left neighbor */
246         if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
247                 d_size = -((int)IH_SIZE), ih_size = 0;
248
249         tb->lnum[0] = 0;
250         for (i = 0; i < vn->vn_nr_item;
251              i++, ih_size = IH_SIZE, d_size = 0, vi++) {
252                 d_size += vi->vi_item_len;
253                 if (cur_free >= d_size) {
254                         /* the item can be shifted entirely */
255                         cur_free -= d_size;
256                         tb->lnum[0]++;
257                         continue;
258                 }
259
260                 /* the item cannot be shifted entirely, try to split it */
261                 /* check whether L[0] can hold ih and at least one byte of the item body */
262                 if (cur_free <= ih_size) {
263                         /* cannot shift even a part of the current item */
264                         tb->lbytes = -1;
265                         return;
266                 }
267                 cur_free -= ih_size;
268
269                 tb->lbytes = op_check_left(vi, cur_free, 0, 0);
270                 if (tb->lbytes != -1)
271                         /* count partially shifted item */
272                         tb->lnum[0]++;
273
274                 break;
275         }
276
277         return;
278 }
279
280 /* using virtual node check, how many items can be shifted to right
281    neighbor */
282 static void check_right(struct tree_balance *tb, int h, int cur_free)
283 {
284         int i;
285         struct virtual_node *vn = tb->tb_vn;
286         struct virtual_item *vi;
287         int d_size, ih_size;
288
289         RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
290
291         /* internal level */
292         if (h > 0) {
293                 tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
294                 return;
295         }
296
297         /* leaf level */
298
299         if (!cur_free || !vn->vn_nr_item) {
300                 /* no free space  */
301                 tb->rnum[h] = 0;
302                 tb->rbytes = -1;
303                 return;
304         }
305
306         RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
307                "vs-8075: parent does not exist or invalid");
308
309         vi = vn->vn_vi + vn->vn_nr_item - 1;
310         if ((unsigned int)cur_free >=
311             (vn->vn_size -
312              ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
313                 /* all contents of S[0] fits into R[0] */
314
315                 RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
316                        "vs-8080: invalid mode or balance condition failed");
317
318                 tb->rnum[h] = vn->vn_nr_item;
319                 tb->rbytes = -1;
320                 return;
321         }
322
323         d_size = 0, ih_size = IH_SIZE;
324
325         /* last item may be merge with first item in right neighbor */
326         if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
327                 d_size = -(int)IH_SIZE, ih_size = 0;
328
329         tb->rnum[0] = 0;
330         for (i = vn->vn_nr_item - 1; i >= 0;
331              i--, d_size = 0, ih_size = IH_SIZE, vi--) {
332                 d_size += vi->vi_item_len;
333                 if (cur_free >= d_size) {
334                         /* the item can be shifted entirely */
335                         cur_free -= d_size;
336                         tb->rnum[0]++;
337                         continue;
338                 }
339
340                 /* check whether R[0] can hold ih and at least one byte of the item body */
341                 if (cur_free <= ih_size) {      /* cannot shift even a part of the current item */
342                         tb->rbytes = -1;
343                         return;
344                 }
345
346                 /* R[0] can hold the header of the item and at least one byte of its body */
347                 cur_free -= ih_size;    /* cur_free is still > 0 */
348
349                 tb->rbytes = op_check_right(vi, cur_free);
350                 if (tb->rbytes != -1)
351                         /* count partially shifted item */
352                         tb->rnum[0]++;
353
354                 break;
355         }
356
357         return;
358 }
359
360 /*
361  * from - number of items, which are shifted to left neighbor entirely
362  * to - number of item, which are shifted to right neighbor entirely
363  * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
364  * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */
365 static int get_num_ver(int mode, struct tree_balance *tb, int h,
366                        int from, int from_bytes,
367                        int to, int to_bytes, short *snum012, int flow)
368 {
369         int i;
370         int cur_free;
371         //    int bytes;
372         int units;
373         struct virtual_node *vn = tb->tb_vn;
374         //    struct virtual_item * vi;
375
376         int total_node_size, max_node_size, current_item_size;
377         int needed_nodes;
378         int start_item,         /* position of item we start filling node from */
379          end_item,              /* position of item we finish filling node by */
380          start_bytes,           /* number of first bytes (entries for directory) of start_item-th item
381                                    we do not include into node that is being filled */
382          end_bytes;             /* number of last bytes (entries for directory) of end_item-th item
383                                    we do node include into node that is being filled */
384         int split_item_positions[2];    /* these are positions in virtual item of
385                                            items, that are split between S[0] and
386                                            S1new and S1new and S2new */
387
388         split_item_positions[0] = -1;
389         split_item_positions[1] = -1;
390
391         /* We only create additional nodes if we are in insert or paste mode
392            or we are in replace mode at the internal level. If h is 0 and
393            the mode is M_REPLACE then in fix_nodes we change the mode to
394            paste or insert before we get here in the code.  */
395         RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
396                "vs-8100: insert_size < 0 in overflow");
397
398         max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
399
400         /* snum012 [0-2] - number of items, that lay
401            to S[0], first new node and second new node */
402         snum012[3] = -1;        /* s1bytes */
403         snum012[4] = -1;        /* s2bytes */
404
405         /* internal level */
406         if (h > 0) {
407                 i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
408                 if (i == max_node_size)
409                         return 1;
410                 return (i / max_node_size + 1);
411         }
412
413         /* leaf level */
414         needed_nodes = 1;
415         total_node_size = 0;
416         cur_free = max_node_size;
417
418         // start from 'from'-th item
419         start_item = from;
420         // skip its first 'start_bytes' units
421         start_bytes = ((from_bytes != -1) ? from_bytes : 0);
422
423         // last included item is the 'end_item'-th one
424         end_item = vn->vn_nr_item - to - 1;
425         // do not count last 'end_bytes' units of 'end_item'-th item
426         end_bytes = (to_bytes != -1) ? to_bytes : 0;
427
428         /* go through all item beginning from the start_item-th item and ending by
429            the end_item-th item. Do not count first 'start_bytes' units of
430            'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
431
432         for (i = start_item; i <= end_item; i++) {
433                 struct virtual_item *vi = vn->vn_vi + i;
434                 int skip_from_end = ((i == end_item) ? end_bytes : 0);
435
436                 RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
437
438                 /* get size of current item */
439                 current_item_size = vi->vi_item_len;
440
441                 /* do not take in calculation head part (from_bytes) of from-th item */
442                 current_item_size -=
443                     op_part_size(vi, 0 /*from start */ , start_bytes);
444
445                 /* do not take in calculation tail part of last item */
446                 current_item_size -=
447                     op_part_size(vi, 1 /*from end */ , skip_from_end);
448
449                 /* if item fits into current node entierly */
450                 if (total_node_size + current_item_size <= max_node_size) {
451                         snum012[needed_nodes - 1]++;
452                         total_node_size += current_item_size;
453                         start_bytes = 0;
454                         continue;
455                 }
456
457                 if (current_item_size > max_node_size) {
458                         /* virtual item length is longer, than max size of item in
459                            a node. It is impossible for direct item */
460                         RFALSE(is_direct_le_ih(vi->vi_ih),
461                                "vs-8110: "
462                                "direct item length is %d. It can not be longer than %d",
463                                current_item_size, max_node_size);
464                         /* we will try to split it */
465                         flow = 1;
466                 }
467
468                 if (!flow) {
469                         /* as we do not split items, take new node and continue */
470                         needed_nodes++;
471                         i--;
472                         total_node_size = 0;
473                         continue;
474                 }
475                 // calculate number of item units which fit into node being
476                 // filled
477                 {
478                         int free_space;
479
480                         free_space = max_node_size - total_node_size - IH_SIZE;
481                         units =
482                             op_check_left(vi, free_space, start_bytes,
483                                           skip_from_end);
484                         if (units == -1) {
485                                 /* nothing fits into current node, take new node and continue */
486                                 needed_nodes++, i--, total_node_size = 0;
487                                 continue;
488                         }
489                 }
490
491                 /* something fits into the current node */
492                 //if (snum012[3] != -1 || needed_nodes != 1)
493                 //  reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
494                 //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
495                 start_bytes += units;
496                 snum012[needed_nodes - 1 + 3] = units;
497
498                 if (needed_nodes > 2)
499                         reiserfs_warning(tb->tb_sb, "vs-8111",
500                                          "split_item_position is out of range");
501                 snum012[needed_nodes - 1]++;
502                 split_item_positions[needed_nodes - 1] = i;
503                 needed_nodes++;
504                 /* continue from the same item with start_bytes != -1 */
505                 start_item = i;
506                 i--;
507                 total_node_size = 0;
508         }
509
510         // sum012[4] (if it is not -1) contains number of units of which
511         // are to be in S1new, snum012[3] - to be in S0. They are supposed
512         // to be S1bytes and S2bytes correspondingly, so recalculate
513         if (snum012[4] > 0) {
514                 int split_item_num;
515                 int bytes_to_r, bytes_to_l;
516                 int bytes_to_S1new;
517
518                 split_item_num = split_item_positions[1];
519                 bytes_to_l =
520                     ((from == split_item_num
521                       && from_bytes != -1) ? from_bytes : 0);
522                 bytes_to_r =
523                     ((end_item == split_item_num
524                       && end_bytes != -1) ? end_bytes : 0);
525                 bytes_to_S1new =
526                     ((split_item_positions[0] ==
527                       split_item_positions[1]) ? snum012[3] : 0);
528
529                 // s2bytes
530                 snum012[4] =
531                     op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
532                     bytes_to_r - bytes_to_l - bytes_to_S1new;
533
534                 if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
535                     vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
536                         reiserfs_warning(tb->tb_sb, "vs-8115",
537                                          "not directory or indirect item");
538         }
539
540         /* now we know S2bytes, calculate S1bytes */
541         if (snum012[3] > 0) {
542                 int split_item_num;
543                 int bytes_to_r, bytes_to_l;
544                 int bytes_to_S2new;
545
546                 split_item_num = split_item_positions[0];
547                 bytes_to_l =
548                     ((from == split_item_num
549                       && from_bytes != -1) ? from_bytes : 0);
550                 bytes_to_r =
551                     ((end_item == split_item_num
552                       && end_bytes != -1) ? end_bytes : 0);
553                 bytes_to_S2new =
554                     ((split_item_positions[0] == split_item_positions[1]
555                       && snum012[4] != -1) ? snum012[4] : 0);
556
557                 // s1bytes
558                 snum012[3] =
559                     op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
560                     bytes_to_r - bytes_to_l - bytes_to_S2new;
561         }
562
563         return needed_nodes;
564 }
565
566
567 /* Set parameters for balancing.
568  * Performs write of results of analysis of balancing into structure tb,
569  * where it will later be used by the functions that actually do the balancing.
570  * Parameters:
571  *      tb      tree_balance structure;
572  *      h       current level of the node;
573  *      lnum    number of items from S[h] that must be shifted to L[h];
574  *      rnum    number of items from S[h] that must be shifted to R[h];
575  *      blk_num number of blocks that S[h] will be splitted into;
576  *      s012    number of items that fall into splitted nodes.
577  *      lbytes  number of bytes which flow to the left neighbor from the item that is not
578  *              not shifted entirely
579  *      rbytes  number of bytes which flow to the right neighbor from the item that is not
580  *              not shifted entirely
581  *      s1bytes number of bytes which flow to the first  new node when S[0] splits (this number is contained in s012 array)
582  */
583
584 static void set_parameters(struct tree_balance *tb, int h, int lnum,
585                            int rnum, int blk_num, short *s012, int lb, int rb)
586 {
587
588         tb->lnum[h] = lnum;
589         tb->rnum[h] = rnum;
590         tb->blknum[h] = blk_num;
591
592         if (h == 0) {           /* only for leaf level */
593                 if (s012 != NULL) {
594                         tb->s0num = *s012++,
595                             tb->s1num = *s012++, tb->s2num = *s012++;
596                         tb->s1bytes = *s012++;
597                         tb->s2bytes = *s012;
598                 }
599                 tb->lbytes = lb;
600                 tb->rbytes = rb;
601         }
602         PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
603         PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
604
605         PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
606         PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
607 }
608
609 /* check, does node disappear if we shift tb->lnum[0] items to left
610    neighbor and tb->rnum[0] to the right one. */
611 static int is_leaf_removable(struct tree_balance *tb)
612 {
613         struct virtual_node *vn = tb->tb_vn;
614         int to_left, to_right;
615         int size;
616         int remain_items;
617
618         /* number of items, that will be shifted to left (right) neighbor
619            entirely */
620         to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
621         to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
622         remain_items = vn->vn_nr_item;
623
624         /* how many items remain in S[0] after shiftings to neighbors */
625         remain_items -= (to_left + to_right);
626
627         if (remain_items < 1) {
628                 /* all content of node can be shifted to neighbors */
629                 set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
630                                NULL, -1, -1);
631                 return 1;
632         }
633
634         if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
635                 /* S[0] is not removable */
636                 return 0;
637
638         /* check, whether we can divide 1 remaining item between neighbors */
639
640         /* get size of remaining item (in item units) */
641         size = op_unit_num(&(vn->vn_vi[to_left]));
642
643         if (tb->lbytes + tb->rbytes >= size) {
644                 set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
645                                tb->lbytes, -1);
646                 return 1;
647         }
648
649         return 0;
650 }
651
652 /* check whether L, S, R can be joined in one node */
653 static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
654 {
655         struct virtual_node *vn = tb->tb_vn;
656         int ih_size;
657         struct buffer_head *S0;
658
659         S0 = PATH_H_PBUFFER(tb->tb_path, 0);
660
661         ih_size = 0;
662         if (vn->vn_nr_item) {
663                 if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
664                         ih_size += IH_SIZE;
665
666                 if (vn->vn_vi[vn->vn_nr_item - 1].
667                     vi_type & VI_TYPE_RIGHT_MERGEABLE)
668                         ih_size += IH_SIZE;
669         } else {
670                 /* there was only one item and it will be deleted */
671                 struct item_head *ih;
672
673                 RFALSE(B_NR_ITEMS(S0) != 1,
674                        "vs-8125: item number must be 1: it is %d",
675                        B_NR_ITEMS(S0));
676
677                 ih = B_N_PITEM_HEAD(S0, 0);
678                 if (tb->CFR[0]
679                     && !comp_short_le_keys(&(ih->ih_key),
680                                            B_N_PDELIM_KEY(tb->CFR[0],
681                                                           tb->rkey[0])))
682                         if (is_direntry_le_ih(ih)) {
683                                 /* Directory must be in correct state here: that is
684                                    somewhere at the left side should exist first directory
685                                    item. But the item being deleted can not be that first
686                                    one because its right neighbor is item of the same
687                                    directory. (But first item always gets deleted in last
688                                    turn). So, neighbors of deleted item can be merged, so
689                                    we can save ih_size */
690                                 ih_size = IH_SIZE;
691
692                                 /* we might check that left neighbor exists and is of the
693                                    same directory */
694                                 RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
695                                        "vs-8130: first directory item can not be removed until directory is not empty");
696                         }
697
698         }
699
700         if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
701                 set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
702                 PROC_INFO_INC(tb->tb_sb, leaves_removable);
703                 return 1;
704         }
705         return 0;
706
707 }
708
709 /* when we do not split item, lnum and rnum are numbers of entire items */
710 #define SET_PAR_SHIFT_LEFT \
711 if (h)\
712 {\
713    int to_l;\
714    \
715    to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
716               (MAX_NR_KEY(Sh) + 1 - lpar);\
717               \
718               set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
719 }\
720 else \
721 {\
722    if (lset==LEFT_SHIFT_FLOW)\
723      set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
724                      tb->lbytes, -1);\
725    else\
726      set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
727                      -1, -1);\
728 }
729
730 #define SET_PAR_SHIFT_RIGHT \
731 if (h)\
732 {\
733    int to_r;\
734    \
735    to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
736    \
737    set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
738 }\
739 else \
740 {\
741    if (rset==RIGHT_SHIFT_FLOW)\
742      set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
743                   -1, tb->rbytes);\
744    else\
745      set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
746                   -1, -1);\
747 }
748
749 static void free_buffers_in_tb(struct tree_balance *tb)
750 {
751         int i;
752
753         pathrelse(tb->tb_path);
754
755         for (i = 0; i < MAX_HEIGHT; i++) {
756                 brelse(tb->L[i]);
757                 brelse(tb->R[i]);
758                 brelse(tb->FL[i]);
759                 brelse(tb->FR[i]);
760                 brelse(tb->CFL[i]);
761                 brelse(tb->CFR[i]);
762
763                 tb->L[i] = NULL;
764                 tb->R[i] = NULL;
765                 tb->FL[i] = NULL;
766                 tb->FR[i] = NULL;
767                 tb->CFL[i] = NULL;
768                 tb->CFR[i] = NULL;
769         }
770 }
771
772 /* Get new buffers for storing new nodes that are created while balancing.
773  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
774  *              CARRY_ON - schedule didn't occur while the function worked;
775  *              NO_DISK_SPACE - no disk space.
776  */
777 /* The function is NOT SCHEDULE-SAFE! */
778 static int get_empty_nodes(struct tree_balance *tb, int h)
779 {
780         struct buffer_head *new_bh,
781             *Sh = PATH_H_PBUFFER(tb->tb_path, h);
782         b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
783         int counter, number_of_freeblk, amount_needed,  /* number of needed empty blocks */
784          retval = CARRY_ON;
785         struct super_block *sb = tb->tb_sb;
786
787         /* number_of_freeblk is the number of empty blocks which have been
788            acquired for use by the balancing algorithm minus the number of
789            empty blocks used in the previous levels of the analysis,
790            number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
791            after empty blocks are acquired, and the balancing analysis is
792            then restarted, amount_needed is the number needed by this level
793            (h) of the balancing analysis.
794
795            Note that for systems with many processes writing, it would be
796            more layout optimal to calculate the total number needed by all
797            levels and then to run reiserfs_new_blocks to get all of them at once.  */
798
799         /* Initiate number_of_freeblk to the amount acquired prior to the restart of
800            the analysis or 0 if not restarted, then subtract the amount needed
801            by all of the levels of the tree below h. */
802         /* blknum includes S[h], so we subtract 1 in this calculation */
803         for (counter = 0, number_of_freeblk = tb->cur_blknum;
804              counter < h; counter++)
805                 number_of_freeblk -=
806                     (tb->blknum[counter]) ? (tb->blknum[counter] -
807                                                    1) : 0;
808
809         /* Allocate missing empty blocks. */
810         /* if Sh == 0  then we are getting a new root */
811         amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
812         /*  Amount_needed = the amount that we need more than the amount that we have. */
813         if (amount_needed > number_of_freeblk)
814                 amount_needed -= number_of_freeblk;
815         else                    /* If we have enough already then there is nothing to do. */
816                 return CARRY_ON;
817
818         /* No need to check quota - is not allocated for blocks used for formatted nodes */
819         if (reiserfs_new_form_blocknrs(tb, blocknrs,
820                                        amount_needed) == NO_DISK_SPACE)
821                 return NO_DISK_SPACE;
822
823         /* for each blocknumber we just got, get a buffer and stick it on FEB */
824         for (blocknr = blocknrs, counter = 0;
825              counter < amount_needed; blocknr++, counter++) {
826
827                 RFALSE(!*blocknr,
828                        "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
829
830                 new_bh = sb_getblk(sb, *blocknr);
831                 RFALSE(buffer_dirty(new_bh) ||
832                        buffer_journaled(new_bh) ||
833                        buffer_journal_dirty(new_bh),
834                        "PAP-8140: journaled or dirty buffer %b for the new block",
835                        new_bh);
836
837                 /* Put empty buffers into the array. */
838                 RFALSE(tb->FEB[tb->cur_blknum],
839                        "PAP-8141: busy slot for new buffer");
840
841                 set_buffer_journal_new(new_bh);
842                 tb->FEB[tb->cur_blknum++] = new_bh;
843         }
844
845         if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
846                 retval = REPEAT_SEARCH;
847
848         return retval;
849 }
850
851 /* Get free space of the left neighbor, which is stored in the parent
852  * node of the left neighbor.  */
853 static int get_lfree(struct tree_balance *tb, int h)
854 {
855         struct buffer_head *l, *f;
856         int order;
857
858         if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
859             (l = tb->FL[h]) == NULL)
860                 return 0;
861
862         if (f == l)
863                 order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
864         else {
865                 order = B_NR_ITEMS(l);
866                 f = l;
867         }
868
869         return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
870 }
871
872 /* Get free space of the right neighbor,
873  * which is stored in the parent node of the right neighbor.
874  */
875 static int get_rfree(struct tree_balance *tb, int h)
876 {
877         struct buffer_head *r, *f;
878         int order;
879
880         if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
881             (r = tb->FR[h]) == NULL)
882                 return 0;
883
884         if (f == r)
885                 order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
886         else {
887                 order = 0;
888                 f = r;
889         }
890
891         return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
892
893 }
894
895 /* Check whether left neighbor is in memory. */
896 static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
897 {
898         struct buffer_head *father, *left;
899         struct super_block *sb = tb->tb_sb;
900         b_blocknr_t left_neighbor_blocknr;
901         int left_neighbor_position;
902
903         /* Father of the left neighbor does not exist. */
904         if (!tb->FL[h])
905                 return 0;
906
907         /* Calculate father of the node to be balanced. */
908         father = PATH_H_PBUFFER(tb->tb_path, h + 1);
909
910         RFALSE(!father ||
911                !B_IS_IN_TREE(father) ||
912                !B_IS_IN_TREE(tb->FL[h]) ||
913                !buffer_uptodate(father) ||
914                !buffer_uptodate(tb->FL[h]),
915                "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
916                father, tb->FL[h]);
917
918         /* Get position of the pointer to the left neighbor into the left father. */
919         left_neighbor_position = (father == tb->FL[h]) ?
920             tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
921         /* Get left neighbor block number. */
922         left_neighbor_blocknr =
923             B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
924         /* Look for the left neighbor in the cache. */
925         if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
926
927                 RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
928                        "vs-8170: left neighbor (%b %z) is not in the tree",
929                        left, left);
930                 put_bh(left);
931                 return 1;
932         }
933
934         return 0;
935 }
936
937 #define LEFT_PARENTS  'l'
938 #define RIGHT_PARENTS 'r'
939
940 static void decrement_key(struct cpu_key *key)
941 {
942         // call item specific function for this key
943         item_ops[cpu_key_k_type(key)]->decrement_key(key);
944 }
945
946 /* Calculate far left/right parent of the left/right neighbor of the current node, that
947  * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
948  * Calculate left/right common parent of the current node and L[h]/R[h].
949  * Calculate left/right delimiting key position.
950  * Returns:     PATH_INCORRECT   - path in the tree is not correct;
951                 SCHEDULE_OCCURRED - schedule occurred while the function worked;
952  *              CARRY_ON         - schedule didn't occur while the function worked;
953  */
954 static int get_far_parent(struct tree_balance *tb,
955                           int h,
956                           struct buffer_head **pfather,
957                           struct buffer_head **pcom_father, char c_lr_par)
958 {
959         struct buffer_head *parent;
960         INITIALIZE_PATH(s_path_to_neighbor_father);
961         struct treepath *path = tb->tb_path;
962         struct cpu_key s_lr_father_key;
963         int counter,
964             position = INT_MAX,
965             first_last_position = 0,
966             path_offset = PATH_H_PATH_OFFSET(path, h);
967
968         /* Starting from F[h] go upwards in the tree, and look for the common
969            ancestor of F[h], and its neighbor l/r, that should be obtained. */
970
971         counter = path_offset;
972
973         RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
974                "PAP-8180: invalid path length");
975
976         for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
977                 /* Check whether parent of the current buffer in the path is really parent in the tree. */
978                 if (!B_IS_IN_TREE
979                     (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
980                         return REPEAT_SEARCH;
981                 /* Check whether position in the parent is correct. */
982                 if ((position =
983                      PATH_OFFSET_POSITION(path,
984                                           counter - 1)) >
985                     B_NR_ITEMS(parent))
986                         return REPEAT_SEARCH;
987                 /* Check whether parent at the path really points to the child. */
988                 if (B_N_CHILD_NUM(parent, position) !=
989                     PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
990                         return REPEAT_SEARCH;
991                 /* Return delimiting key if position in the parent is not equal to first/last one. */
992                 if (c_lr_par == RIGHT_PARENTS)
993                         first_last_position = B_NR_ITEMS(parent);
994                 if (position != first_last_position) {
995                         *pcom_father = parent;
996                         get_bh(*pcom_father);
997                         /*(*pcom_father = parent)->b_count++; */
998                         break;
999                 }
1000         }
1001
1002         /* if we are in the root of the tree, then there is no common father */
1003         if (counter == FIRST_PATH_ELEMENT_OFFSET) {
1004                 /* Check whether first buffer in the path is the root of the tree. */
1005                 if (PATH_OFFSET_PBUFFER
1006                     (tb->tb_path,
1007                      FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1008                     SB_ROOT_BLOCK(tb->tb_sb)) {
1009                         *pfather = *pcom_father = NULL;
1010                         return CARRY_ON;
1011                 }
1012                 return REPEAT_SEARCH;
1013         }
1014
1015         RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
1016                "PAP-8185: (%b %z) level too small",
1017                *pcom_father, *pcom_father);
1018
1019         /* Check whether the common parent is locked. */
1020
1021         if (buffer_locked(*pcom_father)) {
1022
1023                 /* Release the write lock while the buffer is busy */
1024                 reiserfs_write_unlock(tb->tb_sb);
1025                 __wait_on_buffer(*pcom_father);
1026                 reiserfs_write_lock(tb->tb_sb);
1027                 if (FILESYSTEM_CHANGED_TB(tb)) {
1028                         brelse(*pcom_father);
1029                         return REPEAT_SEARCH;
1030                 }
1031         }
1032
1033         /* So, we got common parent of the current node and its left/right neighbor.
1034            Now we are geting the parent of the left/right neighbor. */
1035
1036         /* Form key to get parent of the left/right neighbor. */
1037         le_key2cpu_key(&s_lr_father_key,
1038                        B_N_PDELIM_KEY(*pcom_father,
1039                                       (c_lr_par ==
1040                                        LEFT_PARENTS) ? (tb->lkey[h - 1] =
1041                                                         position -
1042                                                         1) : (tb->rkey[h -
1043                                                                            1] =
1044                                                               position)));
1045
1046         if (c_lr_par == LEFT_PARENTS)
1047                 decrement_key(&s_lr_father_key);
1048
1049         if (search_by_key
1050             (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
1051              h + 1) == IO_ERROR)
1052                 // path is released
1053                 return IO_ERROR;
1054
1055         if (FILESYSTEM_CHANGED_TB(tb)) {
1056                 pathrelse(&s_path_to_neighbor_father);
1057                 brelse(*pcom_father);
1058                 return REPEAT_SEARCH;
1059         }
1060
1061         *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1062
1063         RFALSE(B_LEVEL(*pfather) != h + 1,
1064                "PAP-8190: (%b %z) level too small", *pfather, *pfather);
1065         RFALSE(s_path_to_neighbor_father.path_length <
1066                FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
1067
1068         s_path_to_neighbor_father.path_length--;
1069         pathrelse(&s_path_to_neighbor_father);
1070         return CARRY_ON;
1071 }
1072
1073 /* Get parents of neighbors of node in the path(S[path_offset]) and common parents of
1074  * S[path_offset] and L[path_offset]/R[path_offset]: F[path_offset], FL[path_offset],
1075  * FR[path_offset], CFL[path_offset], CFR[path_offset].
1076  * Calculate numbers of left and right delimiting keys position: lkey[path_offset], rkey[path_offset].
1077  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
1078  *              CARRY_ON - schedule didn't occur while the function worked;
1079  */
1080 static int get_parents(struct tree_balance *tb, int h)
1081 {
1082         struct treepath *path = tb->tb_path;
1083         int position,
1084             ret,
1085             path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
1086         struct buffer_head *curf, *curcf;
1087
1088         /* Current node is the root of the tree or will be root of the tree */
1089         if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1090                 /* The root can not have parents.
1091                    Release nodes which previously were obtained as parents of the current node neighbors. */
1092                 brelse(tb->FL[h]);
1093                 brelse(tb->CFL[h]);
1094                 brelse(tb->FR[h]);
1095                 brelse(tb->CFR[h]);
1096                 tb->FL[h]  = NULL;
1097                 tb->CFL[h] = NULL;
1098                 tb->FR[h]  = NULL;
1099                 tb->CFR[h] = NULL;
1100                 return CARRY_ON;
1101         }
1102
1103         /* Get parent FL[path_offset] of L[path_offset]. */
1104         position = PATH_OFFSET_POSITION(path, path_offset - 1);
1105         if (position) {
1106                 /* Current node is not the first child of its parent. */
1107                 curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1108                 curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1109                 get_bh(curf);
1110                 get_bh(curf);
1111                 tb->lkey[h] = position - 1;
1112         } else {
1113                 /* Calculate current parent of L[path_offset], which is the left neighbor of the current node.
1114                    Calculate current common parent of L[path_offset] and the current node. Note that
1115                    CFL[path_offset] not equal FL[path_offset] and CFL[path_offset] not equal F[path_offset].
1116                    Calculate lkey[path_offset]. */
1117                 if ((ret = get_far_parent(tb, h + 1, &curf,
1118                                                   &curcf,
1119                                                   LEFT_PARENTS)) != CARRY_ON)
1120                         return ret;
1121         }
1122
1123         brelse(tb->FL[h]);
1124         tb->FL[h] = curf;       /* New initialization of FL[h]. */
1125         brelse(tb->CFL[h]);
1126         tb->CFL[h] = curcf;     /* New initialization of CFL[h]. */
1127
1128         RFALSE((curf && !B_IS_IN_TREE(curf)) ||
1129                (curcf && !B_IS_IN_TREE(curcf)),
1130                "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
1131
1132 /* Get parent FR[h] of R[h]. */
1133
1134 /* Current node is the last child of F[h]. FR[h] != F[h]. */
1135         if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
1136 /* Calculate current parent of R[h], which is the right neighbor of F[h].
1137    Calculate current common parent of R[h] and current node. Note that CFR[h]
1138    not equal FR[path_offset] and CFR[h] not equal F[h]. */
1139                 if ((ret =
1140                      get_far_parent(tb, h + 1, &curf, &curcf,
1141                                     RIGHT_PARENTS)) != CARRY_ON)
1142                         return ret;
1143         } else {
1144 /* Current node is not the last child of its parent F[h]. */
1145                 curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1146                 curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1147                 get_bh(curf);
1148                 get_bh(curf);
1149                 tb->rkey[h] = position;
1150         }
1151
1152         brelse(tb->FR[h]);
1153         /* New initialization of FR[path_offset]. */
1154         tb->FR[h] = curf;
1155
1156         brelse(tb->CFR[h]);
1157         /* New initialization of CFR[path_offset]. */
1158         tb->CFR[h] = curcf;
1159
1160         RFALSE((curf && !B_IS_IN_TREE(curf)) ||
1161                (curcf && !B_IS_IN_TREE(curcf)),
1162                "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
1163
1164         return CARRY_ON;
1165 }
1166
1167 /* it is possible to remove node as result of shiftings to
1168    neighbors even when we insert or paste item. */
1169 static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
1170                                       struct tree_balance *tb, int h)
1171 {
1172         struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
1173         int levbytes = tb->insert_size[h];
1174         struct item_head *ih;
1175         struct reiserfs_key *r_key = NULL;
1176
1177         ih = B_N_PITEM_HEAD(Sh, 0);
1178         if (tb->CFR[h])
1179                 r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]);
1180
1181         if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1182             /* shifting may merge items which might save space */
1183             -
1184             ((!h
1185               && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0)
1186             -
1187             ((!h && r_key
1188               && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
1189             + ((h) ? KEY_SIZE : 0)) {
1190                 /* node can not be removed */
1191                 if (sfree >= levbytes) {        /* new item fits into node S[h] without any shifting */
1192                         if (!h)
1193                                 tb->s0num =
1194                                     B_NR_ITEMS(Sh) +
1195                                     ((mode == M_INSERT) ? 1 : 0);
1196                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1197                         return NO_BALANCING_NEEDED;
1198                 }
1199         }
1200         PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
1201         return !NO_BALANCING_NEEDED;
1202 }
1203
1204 /* Check whether current node S[h] is balanced when increasing its size by
1205  * Inserting or Pasting.
1206  * Calculate parameters for balancing for current level h.
1207  * Parameters:
1208  *      tb      tree_balance structure;
1209  *      h       current level of the node;
1210  *      inum    item number in S[h];
1211  *      mode    i - insert, p - paste;
1212  * Returns:     1 - schedule occurred;
1213  *              0 - balancing for higher levels needed;
1214  *             -1 - no balancing for higher levels needed;
1215  *             -2 - no disk space.
1216  */
1217 /* ip means Inserting or Pasting */
1218 static int ip_check_balance(struct tree_balance *tb, int h)
1219 {
1220         struct virtual_node *vn = tb->tb_vn;
1221         int levbytes,           /* Number of bytes that must be inserted into (value
1222                                    is negative if bytes are deleted) buffer which
1223                                    contains node being balanced.  The mnemonic is
1224                                    that the attempted change in node space used level
1225                                    is levbytes bytes. */
1226          ret;
1227
1228         int lfree, sfree, rfree /* free space in L, S and R */ ;
1229
1230         /* nver is short for number of vertixes, and lnver is the number if
1231            we shift to the left, rnver is the number if we shift to the
1232            right, and lrnver is the number if we shift in both directions.
1233            The goal is to minimize first the number of vertixes, and second,
1234            the number of vertixes whose contents are changed by shifting,
1235            and third the number of uncached vertixes whose contents are
1236            changed by shifting and must be read from disk.  */
1237         int nver, lnver, rnver, lrnver;
1238
1239         /* used at leaf level only, S0 = S[0] is the node being balanced,
1240            sInum [ I = 0,1,2 ] is the number of items that will
1241            remain in node SI after balancing.  S1 and S2 are new
1242            nodes that might be created. */
1243
1244         /* we perform 8 calls to get_num_ver().  For each call we calculate five parameters.
1245            where 4th parameter is s1bytes and 5th - s2bytes
1246          */
1247         short snum012[40] = { 0, };     /* s0num, s1num, s2num for 8 cases
1248                                            0,1 - do not shift and do not shift but bottle
1249                                            2 - shift only whole item to left
1250                                            3 - shift to left and bottle as much as possible
1251                                            4,5 - shift to right (whole items and as much as possible
1252                                            6,7 - shift to both directions (whole items and as much as possible)
1253                                          */
1254
1255         /* Sh is the node whose balance is currently being checked */
1256         struct buffer_head *Sh;
1257
1258         Sh = PATH_H_PBUFFER(tb->tb_path, h);
1259         levbytes = tb->insert_size[h];
1260
1261         /* Calculate balance parameters for creating new root. */
1262         if (!Sh) {
1263                 if (!h)
1264                         reiserfs_panic(tb->tb_sb, "vs-8210",
1265                                        "S[0] can not be 0");
1266                 switch (ret = get_empty_nodes(tb, h)) {
1267                 case CARRY_ON:
1268                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1269                         return NO_BALANCING_NEEDED;     /* no balancing for higher levels needed */
1270
1271                 case NO_DISK_SPACE:
1272                 case REPEAT_SEARCH:
1273                         return ret;
1274                 default:
1275                         reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
1276                                        "return value of get_empty_nodes");
1277                 }
1278         }
1279
1280         if ((ret = get_parents(tb, h)) != CARRY_ON)     /* get parents of S[h] neighbors. */
1281                 return ret;
1282
1283         sfree = B_FREE_SPACE(Sh);
1284
1285         /* get free space of neighbors */
1286         rfree = get_rfree(tb, h);
1287         lfree = get_lfree(tb, h);
1288
1289         if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
1290             NO_BALANCING_NEEDED)
1291                 /* and new item fits into node S[h] without any shifting */
1292                 return NO_BALANCING_NEEDED;
1293
1294         create_virtual_node(tb, h);
1295
1296         /*
1297            determine maximal number of items we can shift to the left neighbor (in tb structure)
1298            and the maximal number of bytes that can flow to the left neighbor
1299            from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
1300          */
1301         check_left(tb, h, lfree);
1302
1303         /*
1304            determine maximal number of items we can shift to the right neighbor (in tb structure)
1305            and the maximal number of bytes that can flow to the right neighbor
1306            from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
1307          */
1308         check_right(tb, h, rfree);
1309
1310         /* all contents of internal node S[h] can be moved into its
1311            neighbors, S[h] will be removed after balancing */
1312         if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
1313                 int to_r;
1314
1315                 /* Since we are working on internal nodes, and our internal
1316                    nodes have fixed size entries, then we can balance by the
1317                    number of items rather than the space they consume.  In this
1318                    routine we set the left node equal to the right node,
1319                    allowing a difference of less than or equal to 1 child
1320                    pointer. */
1321                 to_r =
1322                     ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1323                      vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1324                                                 tb->rnum[h]);
1325                 set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1326                                -1, -1);
1327                 return CARRY_ON;
1328         }
1329
1330         /* this checks balance condition, that any two neighboring nodes can not fit in one node */
1331         RFALSE(h &&
1332                (tb->lnum[h] >= vn->vn_nr_item + 1 ||
1333                 tb->rnum[h] >= vn->vn_nr_item + 1),
1334                "vs-8220: tree is not balanced on internal level");
1335         RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
1336                       (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
1337                "vs-8225: tree is not balanced on leaf level");
1338
1339         /* all contents of S[0] can be moved into its neighbors
1340            S[0] will be removed after balancing. */
1341         if (!h && is_leaf_removable(tb))
1342                 return CARRY_ON;
1343
1344         /* why do we perform this check here rather than earlier??
1345            Answer: we can win 1 node in some cases above. Moreover we
1346            checked it above, when we checked, that S[0] is not removable
1347            in principle */
1348         if (sfree >= levbytes) {        /* new item fits into node S[h] without any shifting */
1349                 if (!h)
1350                         tb->s0num = vn->vn_nr_item;
1351                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1352                 return NO_BALANCING_NEEDED;
1353         }
1354
1355         {
1356                 int lpar, rpar, nset, lset, rset, lrset;
1357                 /*
1358                  * regular overflowing of the node
1359                  */
1360
1361                 /* get_num_ver works in 2 modes (FLOW & NO_FLOW)
1362                    lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
1363                    nset, lset, rset, lrset - shows, whether flowing items give better packing
1364                  */
1365 #define FLOW 1
1366 #define NO_FLOW 0               /* do not any splitting */
1367
1368                 /* we choose one the following */
1369 #define NOTHING_SHIFT_NO_FLOW   0
1370 #define NOTHING_SHIFT_FLOW      5
1371 #define LEFT_SHIFT_NO_FLOW      10
1372 #define LEFT_SHIFT_FLOW         15
1373 #define RIGHT_SHIFT_NO_FLOW     20
1374 #define RIGHT_SHIFT_FLOW        25
1375 #define LR_SHIFT_NO_FLOW        30
1376 #define LR_SHIFT_FLOW           35
1377
1378                 lpar = tb->lnum[h];
1379                 rpar = tb->rnum[h];
1380
1381                 /* calculate number of blocks S[h] must be split into when
1382                    nothing is shifted to the neighbors,
1383                    as well as number of items in each part of the split node (s012 numbers),
1384                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
1385                 nset = NOTHING_SHIFT_NO_FLOW;
1386                 nver = get_num_ver(vn->vn_mode, tb, h,
1387                                    0, -1, h ? vn->vn_nr_item : 0, -1,
1388                                    snum012, NO_FLOW);
1389
1390                 if (!h) {
1391                         int nver1;
1392
1393                         /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
1394                         nver1 = get_num_ver(vn->vn_mode, tb, h,
1395                                             0, -1, 0, -1,
1396                                             snum012 + NOTHING_SHIFT_FLOW, FLOW);
1397                         if (nver > nver1)
1398                                 nset = NOTHING_SHIFT_FLOW, nver = nver1;
1399                 }
1400
1401                 /* calculate number of blocks S[h] must be split into when
1402                    l_shift_num first items and l_shift_bytes of the right most
1403                    liquid item to be shifted are shifted to the left neighbor,
1404                    as well as number of items in each part of the splitted node (s012 numbers),
1405                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1406                  */
1407                 lset = LEFT_SHIFT_NO_FLOW;
1408                 lnver = get_num_ver(vn->vn_mode, tb, h,
1409                                     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1410                                     -1, h ? vn->vn_nr_item : 0, -1,
1411                                     snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
1412                 if (!h) {
1413                         int lnver1;
1414
1415                         lnver1 = get_num_ver(vn->vn_mode, tb, h,
1416                                              lpar -
1417                                              ((tb->lbytes != -1) ? 1 : 0),
1418                                              tb->lbytes, 0, -1,
1419                                              snum012 + LEFT_SHIFT_FLOW, FLOW);
1420                         if (lnver > lnver1)
1421                                 lset = LEFT_SHIFT_FLOW, lnver = lnver1;
1422                 }
1423
1424                 /* calculate number of blocks S[h] must be split into when
1425                    r_shift_num first items and r_shift_bytes of the left most
1426                    liquid item to be shifted are shifted to the right neighbor,
1427                    as well as number of items in each part of the splitted node (s012 numbers),
1428                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1429                  */
1430                 rset = RIGHT_SHIFT_NO_FLOW;
1431                 rnver = get_num_ver(vn->vn_mode, tb, h,
1432                                     0, -1,
1433                                     h ? (vn->vn_nr_item - rpar) : (rpar -
1434                                                                    ((tb->
1435                                                                      rbytes !=
1436                                                                      -1) ? 1 :
1437                                                                     0)), -1,
1438                                     snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
1439                 if (!h) {
1440                         int rnver1;
1441
1442                         rnver1 = get_num_ver(vn->vn_mode, tb, h,
1443                                              0, -1,
1444                                              (rpar -
1445                                               ((tb->rbytes != -1) ? 1 : 0)),
1446                                              tb->rbytes,
1447                                              snum012 + RIGHT_SHIFT_FLOW, FLOW);
1448
1449                         if (rnver > rnver1)
1450                                 rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
1451                 }
1452
1453                 /* calculate number of blocks S[h] must be split into when
1454                    items are shifted in both directions,
1455                    as well as number of items in each part of the splitted node (s012 numbers),
1456                    and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1457                  */
1458                 lrset = LR_SHIFT_NO_FLOW;
1459                 lrnver = get_num_ver(vn->vn_mode, tb, h,
1460                                      lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1461                                      -1,
1462                                      h ? (vn->vn_nr_item - rpar) : (rpar -
1463                                                                     ((tb->
1464                                                                       rbytes !=
1465                                                                       -1) ? 1 :
1466                                                                      0)), -1,
1467                                      snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
1468                 if (!h) {
1469                         int lrnver1;
1470
1471                         lrnver1 = get_num_ver(vn->vn_mode, tb, h,
1472                                               lpar -
1473                                               ((tb->lbytes != -1) ? 1 : 0),
1474                                               tb->lbytes,
1475                                               (rpar -
1476                                                ((tb->rbytes != -1) ? 1 : 0)),
1477                                               tb->rbytes,
1478                                               snum012 + LR_SHIFT_FLOW, FLOW);
1479                         if (lrnver > lrnver1)
1480                                 lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
1481                 }
1482
1483                 /* Our general shifting strategy is:
1484                    1) to minimized number of new nodes;
1485                    2) to minimized number of neighbors involved in shifting;
1486                    3) to minimized number of disk reads; */
1487
1488                 /* we can win TWO or ONE nodes by shifting in both directions */
1489                 if (lrnver < lnver && lrnver < rnver) {
1490                         RFALSE(h &&
1491                                (tb->lnum[h] != 1 ||
1492                                 tb->rnum[h] != 1 ||
1493                                 lrnver != 1 || rnver != 2 || lnver != 2
1494                                 || h != 1), "vs-8230: bad h");
1495                         if (lrset == LR_SHIFT_FLOW)
1496                                 set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
1497                                                lrnver, snum012 + lrset,
1498                                                tb->lbytes, tb->rbytes);
1499                         else
1500                                 set_parameters(tb, h,
1501                                                tb->lnum[h] -
1502                                                ((tb->lbytes == -1) ? 0 : 1),
1503                                                tb->rnum[h] -
1504                                                ((tb->rbytes == -1) ? 0 : 1),
1505                                                lrnver, snum012 + lrset, -1, -1);
1506
1507                         return CARRY_ON;
1508                 }
1509
1510                 /* if shifting doesn't lead to better packing then don't shift */
1511                 if (nver == lrnver) {
1512                         set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
1513                                        -1);
1514                         return CARRY_ON;
1515                 }
1516
1517                 /* now we know that for better packing shifting in only one
1518                    direction either to the left or to the right is required */
1519
1520                 /*  if shifting to the left is better than shifting to the right */
1521                 if (lnver < rnver) {
1522                         SET_PAR_SHIFT_LEFT;
1523                         return CARRY_ON;
1524                 }
1525
1526                 /* if shifting to the right is better than shifting to the left */
1527                 if (lnver > rnver) {
1528                         SET_PAR_SHIFT_RIGHT;
1529                         return CARRY_ON;
1530                 }
1531
1532                 /* now shifting in either direction gives the same number
1533                    of nodes and we can make use of the cached neighbors */
1534                 if (is_left_neighbor_in_cache(tb, h)) {
1535                         SET_PAR_SHIFT_LEFT;
1536                         return CARRY_ON;
1537                 }
1538
1539                 /* shift to the right independently on whether the right neighbor in cache or not */
1540                 SET_PAR_SHIFT_RIGHT;
1541                 return CARRY_ON;
1542         }
1543 }
1544
1545 /* Check whether current node S[h] is balanced when Decreasing its size by
1546  * Deleting or Cutting for INTERNAL node of S+tree.
1547  * Calculate parameters for balancing for current level h.
1548  * Parameters:
1549  *      tb      tree_balance structure;
1550  *      h       current level of the node;
1551  *      inum    item number in S[h];
1552  *      mode    i - insert, p - paste;
1553  * Returns:     1 - schedule occurred;
1554  *              0 - balancing for higher levels needed;
1555  *             -1 - no balancing for higher levels needed;
1556  *             -2 - no disk space.
1557  *
1558  * Note: Items of internal nodes have fixed size, so the balance condition for
1559  * the internal part of S+tree is as for the B-trees.
1560  */
1561 static int dc_check_balance_internal(struct tree_balance *tb, int h)
1562 {
1563         struct virtual_node *vn = tb->tb_vn;
1564
1565         /* Sh is the node whose balance is currently being checked,
1566            and Fh is its father.  */
1567         struct buffer_head *Sh, *Fh;
1568         int maxsize, ret;
1569         int lfree, rfree /* free space in L and R */ ;
1570
1571         Sh = PATH_H_PBUFFER(tb->tb_path, h);
1572         Fh = PATH_H_PPARENT(tb->tb_path, h);
1573
1574         maxsize = MAX_CHILD_SIZE(Sh);
1575
1576 /*   using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
1577 /*   new_nr_item = number of items node would have if operation is */
1578 /*      performed without balancing (new_nr_item); */
1579         create_virtual_node(tb, h);
1580
1581         if (!Fh) {              /* S[h] is the root. */
1582                 if (vn->vn_nr_item > 0) {
1583                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1584                         return NO_BALANCING_NEEDED;     /* no balancing for higher levels needed */
1585                 }
1586                 /* new_nr_item == 0.
1587                  * Current root will be deleted resulting in
1588                  * decrementing the tree height. */
1589                 set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
1590                 return CARRY_ON;
1591         }
1592
1593         if ((ret = get_parents(tb, h)) != CARRY_ON)
1594                 return ret;
1595
1596         /* get free space of neighbors */
1597         rfree = get_rfree(tb, h);
1598         lfree = get_lfree(tb, h);
1599
1600         /* determine maximal number of items we can fit into neighbors */
1601         check_left(tb, h, lfree);
1602         check_right(tb, h, rfree);
1603
1604         if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* Balance condition for the internal node is valid.
1605                                                  * In this case we balance only if it leads to better packing. */
1606                 if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* Here we join S[h] with one of its neighbors,
1607                                                          * which is impossible with greater values of new_nr_item. */
1608                         if (tb->lnum[h] >= vn->vn_nr_item + 1) {
1609                                 /* All contents of S[h] can be moved to L[h]. */
1610                                 int n;
1611                                 int order_L;
1612
1613                                 order_L =
1614                                     ((n =
1615                                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1616                                                           h)) ==
1617                                      0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1618                                 n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
1619                                     (DC_SIZE + KEY_SIZE);
1620                                 set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
1621                                                -1);
1622                                 return CARRY_ON;
1623                         }
1624
1625                         if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1626                                 /* All contents of S[h] can be moved to R[h]. */
1627                                 int n;
1628                                 int order_R;
1629
1630                                 order_R =
1631                                     ((n =
1632                                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1633                                                           h)) ==
1634                                      B_NR_ITEMS(Fh)) ? 0 : n + 1;
1635                                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
1636                                     (DC_SIZE + KEY_SIZE);
1637                                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
1638                                                -1);
1639                                 return CARRY_ON;
1640                         }
1641                 }
1642
1643                 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1644                         /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1645                         int to_r;
1646
1647                         to_r =
1648                             ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
1649                              tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
1650                             (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1651                         set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
1652                                        0, NULL, -1, -1);
1653                         return CARRY_ON;
1654                 }
1655
1656                 /* Balancing does not lead to better packing. */
1657                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1658                 return NO_BALANCING_NEEDED;
1659         }
1660
1661         /* Current node contain insufficient number of items. Balancing is required. */
1662         /* Check whether we can merge S[h] with left neighbor. */
1663         if (tb->lnum[h] >= vn->vn_nr_item + 1)
1664                 if (is_left_neighbor_in_cache(tb, h)
1665                     || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
1666                         int n;
1667                         int order_L;
1668
1669                         order_L =
1670                             ((n =
1671                               PATH_H_B_ITEM_ORDER(tb->tb_path,
1672                                                   h)) ==
1673                              0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1674                         n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
1675                                                                       KEY_SIZE);
1676                         set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
1677                         return CARRY_ON;
1678                 }
1679
1680         /* Check whether we can merge S[h] with right neighbor. */
1681         if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1682                 int n;
1683                 int order_R;
1684
1685                 order_R =
1686                     ((n =
1687                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1688                                           h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
1689                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
1690                                                               KEY_SIZE);
1691                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
1692                 return CARRY_ON;
1693         }
1694
1695         /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1696         if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1697                 int to_r;
1698
1699                 to_r =
1700                     ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1701                      vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1702                                                 tb->rnum[h]);
1703                 set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1704                                -1, -1);
1705                 return CARRY_ON;
1706         }
1707
1708         /* For internal nodes try to borrow item from a neighbor */
1709         RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
1710
1711         /* Borrow one or two items from caching neighbor */
1712         if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
1713                 int from_l;
1714
1715                 from_l =
1716                     (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
1717                      1) / 2 - (vn->vn_nr_item + 1);
1718                 set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
1719                 return CARRY_ON;
1720         }
1721
1722         set_parameters(tb, h, 0,
1723                        -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
1724                           1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
1725         return CARRY_ON;
1726 }
1727
1728 /* Check whether current node S[h] is balanced when Decreasing its size by
1729  * Deleting or Truncating for LEAF node of S+tree.
1730  * Calculate parameters for balancing for current level h.
1731  * Parameters:
1732  *      tb      tree_balance structure;
1733  *      h       current level of the node;
1734  *      inum    item number in S[h];
1735  *      mode    i - insert, p - paste;
1736  * Returns:     1 - schedule occurred;
1737  *              0 - balancing for higher levels needed;
1738  *             -1 - no balancing for higher levels needed;
1739  *             -2 - no disk space.
1740  */
1741 static int dc_check_balance_leaf(struct tree_balance *tb, int h)
1742 {
1743         struct virtual_node *vn = tb->tb_vn;
1744
1745         /* Number of bytes that must be deleted from
1746            (value is negative if bytes are deleted) buffer which
1747            contains node being balanced.  The mnemonic is that the
1748            attempted change in node space used level is levbytes bytes. */
1749         int levbytes;
1750         /* the maximal item size */
1751         int maxsize, ret;
1752         /* S0 is the node whose balance is currently being checked,
1753            and F0 is its father.  */
1754         struct buffer_head *S0, *F0;
1755         int lfree, rfree /* free space in L and R */ ;
1756
1757         S0 = PATH_H_PBUFFER(tb->tb_path, 0);
1758         F0 = PATH_H_PPARENT(tb->tb_path, 0);
1759
1760         levbytes = tb->insert_size[h];
1761
1762         maxsize = MAX_CHILD_SIZE(S0);   /* maximal possible size of an item */
1763
1764         if (!F0) {              /* S[0] is the root now. */
1765
1766                 RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
1767                        "vs-8240: attempt to create empty buffer tree");
1768
1769                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1770                 return NO_BALANCING_NEEDED;
1771         }
1772
1773         if ((ret = get_parents(tb, h)) != CARRY_ON)
1774                 return ret;
1775
1776         /* get free space of neighbors */
1777         rfree = get_rfree(tb, h);
1778         lfree = get_lfree(tb, h);
1779
1780         create_virtual_node(tb, h);
1781
1782         /* if 3 leaves can be merge to one, set parameters and return */
1783         if (are_leaves_removable(tb, lfree, rfree))
1784                 return CARRY_ON;
1785
1786         /* determine maximal number of items we can shift to the left/right  neighbor
1787            and the maximal number of bytes that can flow to the left/right neighbor
1788            from the left/right most liquid item that cannot be shifted from S[0] entirely
1789          */
1790         check_left(tb, h, lfree);
1791         check_right(tb, h, rfree);
1792
1793         /* check whether we can merge S with left neighbor. */
1794         if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
1795                 if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||      /* S can not be merged with R */
1796                     !tb->FR[h]) {
1797
1798                         RFALSE(!tb->FL[h],
1799                                "vs-8245: dc_check_balance_leaf: FL[h] must exist");
1800
1801                         /* set parameter to merge S[0] with its left neighbor */
1802                         set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
1803                         return CARRY_ON;
1804                 }
1805
1806         /* check whether we can merge S[0] with right neighbor. */
1807         if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
1808                 set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
1809                 return CARRY_ON;
1810         }
1811
1812         /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
1813         if (is_leaf_removable(tb))
1814                 return CARRY_ON;
1815
1816         /* Balancing is not required. */
1817         tb->s0num = vn->vn_nr_item;
1818         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1819         return NO_BALANCING_NEEDED;
1820 }
1821
1822 /* Check whether current node S[h] is balanced when Decreasing its size by
1823  * Deleting or Cutting.
1824  * Calculate parameters for balancing for current level h.
1825  * Parameters:
1826  *      tb      tree_balance structure;
1827  *      h       current level of the node;
1828  *      inum    item number in S[h];
1829  *      mode    d - delete, c - cut.
1830  * Returns:     1 - schedule occurred;
1831  *              0 - balancing for higher levels needed;
1832  *             -1 - no balancing for higher levels needed;
1833  *             -2 - no disk space.
1834  */
1835 static int dc_check_balance(struct tree_balance *tb, int h)
1836 {
1837         RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
1838                "vs-8250: S is not initialized");
1839
1840         if (h)
1841                 return dc_check_balance_internal(tb, h);
1842         else
1843                 return dc_check_balance_leaf(tb, h);
1844 }
1845
1846 /* Check whether current node S[h] is balanced.
1847  * Calculate parameters for balancing for current level h.
1848  * Parameters:
1849  *
1850  *      tb      tree_balance structure:
1851  *
1852  *              tb is a large structure that must be read about in the header file
1853  *              at the same time as this procedure if the reader is to successfully
1854  *              understand this procedure
1855  *
1856  *      h       current level of the node;
1857  *      inum    item number in S[h];
1858  *      mode    i - insert, p - paste, d - delete, c - cut.
1859  * Returns:     1 - schedule occurred;
1860  *              0 - balancing for higher levels needed;
1861  *             -1 - no balancing for higher levels needed;
1862  *             -2 - no disk space.
1863  */
1864 static int check_balance(int mode,
1865                          struct tree_balance *tb,
1866                          int h,
1867                          int inum,
1868                          int pos_in_item,
1869                          struct item_head *ins_ih, const void *data)
1870 {
1871         struct virtual_node *vn;
1872
1873         vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
1874         vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
1875         vn->vn_mode = mode;
1876         vn->vn_affected_item_num = inum;
1877         vn->vn_pos_in_item = pos_in_item;
1878         vn->vn_ins_ih = ins_ih;
1879         vn->vn_data = data;
1880
1881         RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
1882                "vs-8255: ins_ih can not be 0 in insert mode");
1883
1884         if (tb->insert_size[h] > 0)
1885                 /* Calculate balance parameters when size of node is increasing. */
1886                 return ip_check_balance(tb, h);
1887
1888         /* Calculate balance parameters when  size of node is decreasing. */
1889         return dc_check_balance(tb, h);
1890 }
1891
1892 /* Check whether parent at the path is the really parent of the current node.*/
1893 static int get_direct_parent(struct tree_balance *tb, int h)
1894 {
1895         struct buffer_head *bh;
1896         struct treepath *path = tb->tb_path;
1897         int position,
1898             path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
1899
1900         /* We are in the root or in the new root. */
1901         if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1902
1903                 RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
1904                        "PAP-8260: invalid offset in the path");
1905
1906                 if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
1907                     b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
1908                         /* Root is not changed. */
1909                         PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
1910                         PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
1911                         return CARRY_ON;
1912                 }
1913                 return REPEAT_SEARCH;   /* Root is changed and we must recalculate the path. */
1914         }
1915
1916         if (!B_IS_IN_TREE
1917             (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
1918                 return REPEAT_SEARCH;   /* Parent in the path is not in the tree. */
1919
1920         if ((position =
1921              PATH_OFFSET_POSITION(path,
1922                                   path_offset - 1)) > B_NR_ITEMS(bh))
1923                 return REPEAT_SEARCH;
1924
1925         if (B_N_CHILD_NUM(bh, position) !=
1926             PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
1927                 /* Parent in the path is not parent of the current node in the tree. */
1928                 return REPEAT_SEARCH;
1929
1930         if (buffer_locked(bh)) {
1931                 reiserfs_write_unlock(tb->tb_sb);
1932                 __wait_on_buffer(bh);
1933                 reiserfs_write_lock(tb->tb_sb);
1934                 if (FILESYSTEM_CHANGED_TB(tb))
1935                         return REPEAT_SEARCH;
1936         }
1937
1938         return CARRY_ON;        /* Parent in the path is unlocked and really parent of the current node.  */
1939 }
1940
1941 /* Using lnum[h] and rnum[h] we should determine what neighbors
1942  * of S[h] we
1943  * need in order to balance S[h], and get them if necessary.
1944  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
1945  *              CARRY_ON - schedule didn't occur while the function worked;
1946  */
1947 static int get_neighbors(struct tree_balance *tb, int h)
1948 {
1949         int child_position,
1950             path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
1951         unsigned long son_number;
1952         struct super_block *sb = tb->tb_sb;
1953         struct buffer_head *bh;
1954
1955         PROC_INFO_INC(sb, get_neighbors[h]);
1956
1957         if (tb->lnum[h]) {
1958                 /* We need left neighbor to balance S[h]. */
1959                 PROC_INFO_INC(sb, need_l_neighbor[h]);
1960                 bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
1961
1962                 RFALSE(bh == tb->FL[h] &&
1963                        !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
1964                        "PAP-8270: invalid position in the parent");
1965
1966                 child_position =
1967                     (bh ==
1968                      tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
1969                                                                        FL[h]);
1970                 son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
1971                 reiserfs_write_unlock(sb);
1972                 bh = sb_bread(sb, son_number);
1973                 reiserfs_write_lock(sb);
1974                 if (!bh)
1975                         return IO_ERROR;
1976                 if (FILESYSTEM_CHANGED_TB(tb)) {
1977                         brelse(bh);
1978                         PROC_INFO_INC(sb, get_neighbors_restart[h]);
1979                         return REPEAT_SEARCH;
1980                 }
1981
1982                 RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
1983                        child_position > B_NR_ITEMS(tb->FL[h]) ||
1984                        B_N_CHILD_NUM(tb->FL[h], child_position) !=
1985                        bh->b_blocknr, "PAP-8275: invalid parent");
1986                 RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
1987                 RFALSE(!h &&
1988                        B_FREE_SPACE(bh) !=
1989                        MAX_CHILD_SIZE(bh) -
1990                        dc_size(B_N_CHILD(tb->FL[0], child_position)),
1991                        "PAP-8290: invalid child size of left neighbor");
1992
1993                 brelse(tb->L[h]);
1994                 tb->L[h] = bh;
1995         }
1996
1997         /* We need right neighbor to balance S[path_offset]. */
1998         if (tb->rnum[h]) {      /* We need right neighbor to balance S[path_offset]. */
1999                 PROC_INFO_INC(sb, need_r_neighbor[h]);
2000                 bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
2001
2002                 RFALSE(bh == tb->FR[h] &&
2003                        PATH_OFFSET_POSITION(tb->tb_path,
2004                                             path_offset) >=
2005                        B_NR_ITEMS(bh),
2006                        "PAP-8295: invalid position in the parent");
2007
2008                 child_position =
2009                     (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
2010                 son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
2011                 reiserfs_write_unlock(sb);
2012                 bh = sb_bread(sb, son_number);
2013                 reiserfs_write_lock(sb);
2014                 if (!bh)
2015                         return IO_ERROR;
2016                 if (FILESYSTEM_CHANGED_TB(tb)) {
2017                         brelse(bh);
2018                         PROC_INFO_INC(sb, get_neighbors_restart[h]);
2019                         return REPEAT_SEARCH;
2020                 }
2021                 brelse(tb->R[h]);
2022                 tb->R[h] = bh;
2023
2024                 RFALSE(!h
2025                        && B_FREE_SPACE(bh) !=
2026                        MAX_CHILD_SIZE(bh) -
2027                        dc_size(B_N_CHILD(tb->FR[0], child_position)),
2028                        "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
2029                        B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
2030                        dc_size(B_N_CHILD(tb->FR[0], child_position)));
2031
2032         }
2033         return CARRY_ON;
2034 }
2035
2036 static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
2037 {
2038         int max_num_of_items;
2039         int max_num_of_entries;
2040         unsigned long blocksize = sb->s_blocksize;
2041
2042 #define MIN_NAME_LEN 1
2043
2044         max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2045         max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2046             (DEH_SIZE + MIN_NAME_LEN);
2047
2048         return sizeof(struct virtual_node) +
2049             max(max_num_of_items * sizeof(struct virtual_item),
2050                 sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
2051                 (max_num_of_entries - 1) * sizeof(__u16));
2052 }
2053
2054 /* maybe we should fail balancing we are going to perform when kmalloc
2055    fails several times. But now it will loop until kmalloc gets
2056    required memory */
2057 static int get_mem_for_virtual_node(struct tree_balance *tb)
2058 {
2059         int check_fs = 0;
2060         int size;
2061         char *buf;
2062
2063         size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
2064
2065         if (size > tb->vn_buf_size) {
2066                 /* we have to allocate more memory for virtual node */
2067                 if (tb->vn_buf) {
2068                         /* free memory allocated before */
2069                         kfree(tb->vn_buf);
2070                         /* this is not needed if kfree is atomic */
2071                         check_fs = 1;
2072                 }
2073
2074                 /* virtual node requires now more memory */
2075                 tb->vn_buf_size = size;
2076
2077                 /* get memory for virtual item */
2078                 buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
2079                 if (!buf) {
2080                         /* getting memory with GFP_KERNEL priority may involve
2081                            balancing now (due to indirect_to_direct conversion on
2082                            dcache shrinking). So, release path and collected
2083                            resources here */
2084                         free_buffers_in_tb(tb);
2085                         buf = kmalloc(size, GFP_NOFS);
2086                         if (!buf) {
2087                                 tb->vn_buf_size = 0;
2088                         }
2089                         tb->vn_buf = buf;
2090                         schedule();
2091                         return REPEAT_SEARCH;
2092                 }
2093
2094                 tb->vn_buf = buf;
2095         }
2096
2097         if (check_fs && FILESYSTEM_CHANGED_TB(tb))
2098                 return REPEAT_SEARCH;
2099
2100         return CARRY_ON;
2101 }
2102
2103 #ifdef CONFIG_REISERFS_CHECK
2104 static void tb_buffer_sanity_check(struct super_block *sb,
2105                                    struct buffer_head *bh,
2106                                    const char *descr, int level)
2107 {
2108         if (bh) {
2109                 if (atomic_read(&(bh->b_count)) <= 0)
2110
2111                         reiserfs_panic(sb, "jmacd-1", "negative or zero "
2112                                        "reference counter for buffer %s[%d] "
2113                                        "(%b)", descr, level, bh);
2114
2115                 if (!buffer_uptodate(bh))
2116                         reiserfs_panic(sb, "jmacd-2", "buffer is not up "
2117                                        "to date %s[%d] (%b)",
2118                                        descr, level, bh);
2119
2120                 if (!B_IS_IN_TREE(bh))
2121                         reiserfs_panic(sb, "jmacd-3", "buffer is not "
2122                                        "in tree %s[%d] (%b)",
2123                                        descr, level, bh);
2124
2125                 if (bh->b_bdev != sb->s_bdev)
2126                         reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
2127                                        "device %s[%d] (%b)",
2128                                        descr, level, bh);
2129
2130                 if (bh->b_size != sb->s_blocksize)
2131                         reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
2132                                        "blocksize %s[%d] (%b)",
2133                                        descr, level, bh);
2134
2135                 if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
2136                         reiserfs_panic(sb, "jmacd-6", "buffer block "
2137                                        "number too high %s[%d] (%b)",
2138                                        descr, level, bh);
2139         }
2140 }
2141 #else
2142 static void tb_buffer_sanity_check(struct super_block *sb,
2143                                    struct buffer_head *bh,
2144                                    const char *descr, int level)
2145 {;
2146 }
2147 #endif
2148
2149 static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
2150 {
2151         return reiserfs_prepare_for_journal(s, bh, 0);
2152 }
2153
2154 static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
2155 {
2156         struct buffer_head *locked;
2157 #ifdef CONFIG_REISERFS_CHECK
2158         int repeat_counter = 0;
2159 #endif
2160         int i;
2161
2162         do {
2163
2164                 locked = NULL;
2165
2166                 for (i = tb->tb_path->path_length;
2167                      !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
2168                         if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
2169                                 /* if I understand correctly, we can only be sure the last buffer
2170                                  ** in the path is in the tree --clm
2171                                  */
2172 #ifdef CONFIG_REISERFS_CHECK
2173                                 if (PATH_PLAST_BUFFER(tb->tb_path) ==
2174                                     PATH_OFFSET_PBUFFER(tb->tb_path, i))
2175                                         tb_buffer_sanity_check(tb->tb_sb,
2176                                                                PATH_OFFSET_PBUFFER
2177                                                                (tb->tb_path,
2178                                                                 i), "S",
2179                                                                tb->tb_path->
2180                                                                path_length - i);
2181 #endif
2182                                 if (!clear_all_dirty_bits(tb->tb_sb,
2183                                                           PATH_OFFSET_PBUFFER
2184                                                           (tb->tb_path,
2185                                                            i))) {
2186                                         locked =
2187                                             PATH_OFFSET_PBUFFER(tb->tb_path,
2188                                                                 i);
2189                                 }
2190                         }
2191                 }
2192
2193                 for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
2194                      i++) {
2195
2196                         if (tb->lnum[i]) {
2197
2198                                 if (tb->L[i]) {
2199                                         tb_buffer_sanity_check(tb->tb_sb,
2200                                                                tb->L[i],
2201                                                                "L", i);
2202                                         if (!clear_all_dirty_bits
2203                                             (tb->tb_sb, tb->L[i]))
2204                                                 locked = tb->L[i];
2205                                 }
2206
2207                                 if (!locked && tb->FL[i]) {
2208                                         tb_buffer_sanity_check(tb->tb_sb,
2209                                                                tb->FL[i],
2210                                                                "FL", i);
2211                                         if (!clear_all_dirty_bits
2212                                             (tb->tb_sb, tb->FL[i]))
2213                                                 locked = tb->FL[i];
2214                                 }
2215
2216                                 if (!locked && tb->CFL[i]) {
2217                                         tb_buffer_sanity_check(tb->tb_sb,
2218                                                                tb->CFL[i],
2219                                                                "CFL", i);
2220                                         if (!clear_all_dirty_bits
2221                                             (tb->tb_sb, tb->CFL[i]))
2222                                                 locked = tb->CFL[i];
2223                                 }
2224
2225                         }
2226
2227                         if (!locked && (tb->rnum[i])) {
2228
2229                                 if (tb->R[i]) {
2230                                         tb_buffer_sanity_check(tb->tb_sb,
2231                                                                tb->R[i],
2232                                                                "R", i);
2233                                         if (!clear_all_dirty_bits
2234                                             (tb->tb_sb, tb->R[i]))
2235                                                 locked = tb->R[i];
2236                                 }
2237
2238                                 if (!locked && tb->FR[i]) {
2239                                         tb_buffer_sanity_check(tb->tb_sb,
2240                                                                tb->FR[i],
2241                                                                "FR", i);
2242                                         if (!clear_all_dirty_bits
2243                                             (tb->tb_sb, tb->FR[i]))
2244                                                 locked = tb->FR[i];
2245                                 }
2246
2247                                 if (!locked && tb->CFR[i]) {
2248                                         tb_buffer_sanity_check(tb->tb_sb,
2249                                                                tb->CFR[i],
2250                                                                "CFR", i);
2251                                         if (!clear_all_dirty_bits
2252                                             (tb->tb_sb, tb->CFR[i]))
2253                                                 locked = tb->CFR[i];
2254                                 }
2255                         }
2256                 }
2257                 /* as far as I can tell, this is not required.  The FEB list seems
2258                  ** to be full of newly allocated nodes, which will never be locked,
2259                  ** dirty, or anything else.
2260                  ** To be safe, I'm putting in the checks and waits in.  For the moment,
2261                  ** they are needed to keep the code in journal.c from complaining
2262                  ** about the buffer.  That code is inside CONFIG_REISERFS_CHECK as well.
2263                  ** --clm
2264                  */
2265                 for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
2266                         if (tb->FEB[i]) {
2267                                 if (!clear_all_dirty_bits
2268                                     (tb->tb_sb, tb->FEB[i]))
2269                                         locked = tb->FEB[i];
2270                         }
2271                 }
2272
2273                 if (locked) {
2274 #ifdef CONFIG_REISERFS_CHECK
2275                         repeat_counter++;
2276                         if ((repeat_counter % 10000) == 0) {
2277                                 reiserfs_warning(tb->tb_sb, "reiserfs-8200",
2278                                                  "too many iterations waiting "
2279                                                  "for buffer to unlock "
2280                                                  "(%b)", locked);
2281
2282                                 /* Don't loop forever.  Try to recover from possible error. */
2283
2284                                 return (FILESYSTEM_CHANGED_TB(tb)) ?
2285                                     REPEAT_SEARCH : CARRY_ON;
2286                         }
2287 #endif
2288                         reiserfs_write_unlock(tb->tb_sb);
2289                         __wait_on_buffer(locked);
2290                         reiserfs_write_lock(tb->tb_sb);
2291                         if (FILESYSTEM_CHANGED_TB(tb))
2292                                 return REPEAT_SEARCH;
2293                 }
2294
2295         } while (locked);
2296
2297         return CARRY_ON;
2298 }
2299
2300 /* Prepare for balancing, that is
2301  *      get all necessary parents, and neighbors;
2302  *      analyze what and where should be moved;
2303  *      get sufficient number of new nodes;
2304  * Balancing will start only after all resources will be collected at a time.
2305  *
2306  * When ported to SMP kernels, only at the last moment after all needed nodes
2307  * are collected in cache, will the resources be locked using the usual
2308  * textbook ordered lock acquisition algorithms.  Note that ensuring that
2309  * this code neither write locks what it does not need to write lock nor locks out of order
2310  * will be a pain in the butt that could have been avoided.  Grumble grumble. -Hans
2311  *
2312  * fix is meant in the sense of render unchanging
2313  *
2314  * Latency might be improved by first gathering a list of what buffers are needed
2315  * and then getting as many of them in parallel as possible? -Hans
2316  *
2317  * Parameters:
2318  *      op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
2319  *      tb      tree_balance structure;
2320  *      inum    item number in S[h];
2321  *      pos_in_item - comment this if you can
2322  *      ins_ih  item head of item being inserted
2323  *      data    inserted item or data to be pasted
2324  * Returns:     1 - schedule occurred while the function worked;
2325  *              0 - schedule didn't occur while the function worked;
2326  *             -1 - if no_disk_space
2327  */
2328
2329 int fix_nodes(int op_mode, struct tree_balance *tb,
2330               struct item_head *ins_ih, const void *data)
2331 {
2332         int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
2333         int pos_in_item;
2334
2335         /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
2336          ** during wait_tb_buffers_run
2337          */
2338         int wait_tb_buffers_run = 0;
2339         struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2340
2341         ++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
2342
2343         pos_in_item = tb->tb_path->pos_in_item;
2344
2345         tb->fs_gen = get_generation(tb->tb_sb);
2346
2347         /* we prepare and log the super here so it will already be in the
2348          ** transaction when do_balance needs to change it.
2349          ** This way do_balance won't have to schedule when trying to prepare
2350          ** the super for logging
2351          */
2352         reiserfs_prepare_for_journal(tb->tb_sb,
2353                                      SB_BUFFER_WITH_SB(tb->tb_sb), 1);
2354         journal_mark_dirty(tb->transaction_handle, tb->tb_sb,
2355                            SB_BUFFER_WITH_SB(tb->tb_sb));
2356         if (FILESYSTEM_CHANGED_TB(tb))
2357                 return REPEAT_SEARCH;
2358
2359         /* if it possible in indirect_to_direct conversion */
2360         if (buffer_locked(tbS0)) {
2361                 reiserfs_write_unlock(tb->tb_sb);
2362                 __wait_on_buffer(tbS0);
2363                 reiserfs_write_lock(tb->tb_sb);
2364                 if (FILESYSTEM_CHANGED_TB(tb))
2365                         return REPEAT_SEARCH;
2366         }
2367 #ifdef CONFIG_REISERFS_CHECK
2368         if (REISERFS_SB(tb->tb_sb)->cur_tb) {
2369                 print_cur_tb("fix_nodes");
2370                 reiserfs_panic(tb->tb_sb, "PAP-8305",
2371                                "there is pending do_balance");
2372         }
2373
2374         if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
2375                 reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
2376                                "not uptodate at the beginning of fix_nodes "
2377                                "or not in tree (mode %c)",
2378                                tbS0, tbS0, op_mode);
2379
2380         /* Check parameters. */
2381         switch (op_mode) {
2382         case M_INSERT:
2383                 if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
2384                         reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
2385                                        "item number %d (in S0 - %d) in case "
2386                                        "of insert", item_num,
2387                                        B_NR_ITEMS(tbS0));
2388                 break;
2389         case M_PASTE:
2390         case M_DELETE:
2391         case M_CUT:
2392                 if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
2393                         print_block(tbS0, 0, -1, -1);
2394                         reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
2395                                        "item number(%d); mode = %c "
2396                                        "insert_size = %d",
2397                                        item_num, op_mode,
2398                                        tb->insert_size[0]);
2399                 }
2400                 break;
2401         default:
2402                 reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
2403                                "of operation");
2404         }
2405 #endif
2406
2407         if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
2408                 // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
2409                 return REPEAT_SEARCH;
2410
2411         /* Starting from the leaf level; for all levels h of the tree. */
2412         for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
2413                 ret = get_direct_parent(tb, h);
2414                 if (ret != CARRY_ON)
2415                         goto repeat;
2416
2417                 ret = check_balance(op_mode, tb, h, item_num,
2418                                     pos_in_item, ins_ih, data);
2419                 if (ret != CARRY_ON) {
2420                         if (ret == NO_BALANCING_NEEDED) {
2421                                 /* No balancing for higher levels needed. */
2422                                 ret = get_neighbors(tb, h);
2423                                 if (ret != CARRY_ON)
2424                                         goto repeat;
2425                                 if (h != MAX_HEIGHT - 1)
2426                                         tb->insert_size[h + 1] = 0;
2427                                 /* ok, analysis and resource gathering are complete */
2428                                 break;
2429                         }
2430                         goto repeat;
2431                 }
2432
2433                 ret = get_neighbors(tb, h);
2434                 if (ret != CARRY_ON)
2435                         goto repeat;
2436
2437                 /* No disk space, or schedule occurred and analysis may be
2438                  * invalid and needs to be redone. */
2439                 ret = get_empty_nodes(tb, h);
2440                 if (ret != CARRY_ON)
2441                         goto repeat;
2442
2443                 if (!PATH_H_PBUFFER(tb->tb_path, h)) {
2444                         /* We have a positive insert size but no nodes exist on this
2445                            level, this means that we are creating a new root. */
2446
2447                         RFALSE(tb->blknum[h] != 1,
2448                                "PAP-8350: creating new empty root");
2449
2450                         if (h < MAX_HEIGHT - 1)
2451                                 tb->insert_size[h + 1] = 0;
2452                 } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
2453                         if (tb->blknum[h] > 1) {
2454                                 /* The tree needs to be grown, so this node S[h]
2455                                    which is the root node is split into two nodes,
2456                                    and a new node (S[h+1]) will be created to
2457                                    become the root node.  */
2458
2459                                 RFALSE(h == MAX_HEIGHT - 1,
2460                                        "PAP-8355: attempt to create too high of a tree");
2461
2462                                 tb->insert_size[h + 1] =
2463                                     (DC_SIZE +
2464                                      KEY_SIZE) * (tb->blknum[h] - 1) +
2465                                     DC_SIZE;
2466                         } else if (h < MAX_HEIGHT - 1)
2467                                 tb->insert_size[h + 1] = 0;
2468                 } else
2469                         tb->insert_size[h + 1] =
2470                             (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
2471         }
2472
2473         ret = wait_tb_buffers_until_unlocked(tb);
2474         if (ret == CARRY_ON) {
2475                 if (FILESYSTEM_CHANGED_TB(tb)) {
2476                         wait_tb_buffers_run = 1;
2477                         ret = REPEAT_SEARCH;
2478                         goto repeat;
2479                 } else {
2480                         return CARRY_ON;
2481                 }
2482         } else {
2483                 wait_tb_buffers_run = 1;
2484                 goto repeat;
2485         }
2486
2487       repeat:
2488         // fix_nodes was unable to perform its calculation due to
2489         // filesystem got changed under us, lack of free disk space or i/o
2490         // failure. If the first is the case - the search will be
2491         // repeated. For now - free all resources acquired so far except
2492         // for the new allocated nodes
2493         {
2494                 int i;
2495
2496                 /* Release path buffers. */
2497                 if (wait_tb_buffers_run) {
2498                         pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2499                 } else {
2500                         pathrelse(tb->tb_path);
2501                 }
2502                 /* brelse all resources collected for balancing */
2503                 for (i = 0; i < MAX_HEIGHT; i++) {
2504                         if (wait_tb_buffers_run) {
2505                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2506                                                                  tb->L[i]);
2507                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2508                                                                  tb->R[i]);
2509                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2510                                                                  tb->FL[i]);
2511                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2512                                                                  tb->FR[i]);
2513                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2514                                                                  tb->
2515                                                                  CFL[i]);
2516                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2517                                                                  tb->
2518                                                                  CFR[i]);
2519                         }
2520
2521                         brelse(tb->L[i]);
2522                         brelse(tb->R[i]);
2523                         brelse(tb->FL[i]);
2524                         brelse(tb->FR[i]);
2525                         brelse(tb->CFL[i]);
2526                         brelse(tb->CFR[i]);
2527
2528                         tb->L[i] = NULL;
2529                         tb->R[i] = NULL;
2530                         tb->FL[i] = NULL;
2531                         tb->FR[i] = NULL;
2532                         tb->CFL[i] = NULL;
2533                         tb->CFR[i] = NULL;
2534                 }
2535
2536                 if (wait_tb_buffers_run) {
2537                         for (i = 0; i < MAX_FEB_SIZE; i++) {
2538                                 if (tb->FEB[i])
2539                                         reiserfs_restore_prepared_buffer
2540                                             (tb->tb_sb, tb->FEB[i]);
2541                         }
2542                 }
2543                 return ret;
2544         }
2545
2546 }
2547
2548 /* Anatoly will probably forgive me renaming tb to tb. I just
2549    wanted to make lines shorter */
2550 void unfix_nodes(struct tree_balance *tb)
2551 {
2552         int i;
2553
2554         /* Release path buffers. */
2555         pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2556
2557         /* brelse all resources collected for balancing */
2558         for (i = 0; i < MAX_HEIGHT; i++) {
2559                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
2560                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
2561                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
2562                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
2563                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
2564                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
2565
2566                 brelse(tb->L[i]);
2567                 brelse(tb->R[i]);
2568                 brelse(tb->FL[i]);
2569                 brelse(tb->FR[i]);
2570                 brelse(tb->CFL[i]);
2571                 brelse(tb->CFR[i]);
2572         }
2573
2574         /* deal with list of allocated (used and unused) nodes */
2575         for (i = 0; i < MAX_FEB_SIZE; i++) {
2576                 if (tb->FEB[i]) {
2577                         b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
2578                         /* de-allocated block which was not used by balancing and
2579                            bforget about buffer for it */
2580                         brelse(tb->FEB[i]);
2581                         reiserfs_free_block(tb->transaction_handle, NULL,
2582                                             blocknr, 0);
2583                 }
2584                 if (tb->used[i]) {
2585                         /* release used as new nodes including a new root */
2586                         brelse(tb->used[i]);
2587                 }
2588         }
2589
2590         kfree(tb->vn_buf);
2591
2592 }