update to 2.6.9-rc1
[linux-flexiantxendom0-3.2.10.git] / fs / xfs / xfs_vnodeops.c
1 /*
2  * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms of version 2 of the GNU General Public License as
6  * published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it would be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11  *
12  * Further, this software is distributed without any warranty that it is
13  * free of the rightful claim of any third person regarding infringement
14  * or the like.  Any license provided herein, whether implied or
15  * otherwise, applies only to this software file.  Patent licenses, if
16  * any, provided herein do not apply to combinations of this program with
17  * other software, or any other product whatsoever.
18  *
19  * You should have received a copy of the GNU General Public License along
20  * with this program; if not, write the Free Software Foundation, Inc., 59
21  * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22  *
23  * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24  * Mountain View, CA  94043, or:
25  *
26  * http://www.sgi.com
27  *
28  * For further information regarding this notice, see:
29  *
30  * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31  */
32
33 #include "xfs.h"
34 #include "xfs_macros.h"
35 #include "xfs_types.h"
36 #include "xfs_inum.h"
37 #include "xfs_log.h"
38 #include "xfs_trans.h"
39 #include "xfs_sb.h"
40 #include "xfs_ag.h"
41 #include "xfs_dir.h"
42 #include "xfs_dir2.h"
43 #include "xfs_dmapi.h"
44 #include "xfs_mount.h"
45 #include "xfs_alloc_btree.h"
46 #include "xfs_bmap_btree.h"
47 #include "xfs_ialloc_btree.h"
48 #include "xfs_itable.h"
49 #include "xfs_btree.h"
50 #include "xfs_ialloc.h"
51 #include "xfs_alloc.h"
52 #include "xfs_attr_sf.h"
53 #include "xfs_dir_sf.h"
54 #include "xfs_dir2_sf.h"
55 #include "xfs_dinode.h"
56 #include "xfs_inode_item.h"
57 #include "xfs_inode.h"
58 #include "xfs_bmap.h"
59 #include "xfs_da_btree.h"
60 #include "xfs_attr.h"
61 #include "xfs_rw.h"
62 #include "xfs_refcache.h"
63 #include "xfs_error.h"
64 #include "xfs_bit.h"
65 #include "xfs_rtalloc.h"
66 #include "xfs_quota.h"
67 #include "xfs_utils.h"
68 #include "xfs_trans_space.h"
69 #include "xfs_dir_leaf.h"
70 #include "xfs_mac.h"
71 #include "xfs_log_priv.h"
72
73
74 /*
75  * The maximum pathlen is 1024 bytes. Since the minimum file system
76  * blocksize is 512 bytes, we can get a max of 2 extents back from
77  * bmapi.
78  */
79 #define SYMLINK_MAPS 2
80
81 /*
82  * For xfs, we check that the file isn't too big to be opened by this kernel.
83  * No other open action is required for regular files.  Devices are handled
84  * through the specfs file system, pipes through fifofs.  Device and
85  * fifo vnodes are "wrapped" by specfs and fifofs vnodes, respectively,
86  * when a new vnode is first looked up or created.
87  */
88 STATIC int
89 xfs_open(
90         bhv_desc_t      *bdp,
91         cred_t          *credp)
92 {
93         int             mode;
94         vnode_t         *vp;
95         xfs_inode_t     *ip;
96
97         vp = BHV_TO_VNODE(bdp);
98         ip = XFS_BHVTOI(bdp);
99
100         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
101                 return XFS_ERROR(EIO);
102
103         /*
104          * If it's a directory with any blocks, read-ahead block 0
105          * as we're almost certain to have the next operation be a read there.
106          */
107         if (vp->v_type == VDIR && ip->i_d.di_nextents > 0) {
108                 mode = xfs_ilock_map_shared(ip);
109                 if (ip->i_d.di_nextents > 0)
110                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
111                 xfs_iunlock(ip, mode);
112         }
113         return 0;
114 }
115
116
117 /*
118  * xfs_getattr
119  */
120 STATIC int
121 xfs_getattr(
122         bhv_desc_t      *bdp,
123         vattr_t         *vap,
124         int             flags,
125         cred_t          *credp)
126 {
127         xfs_inode_t     *ip;
128         xfs_mount_t     *mp;
129         vnode_t         *vp;
130
131         vp  = BHV_TO_VNODE(bdp);
132         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
133
134         ip = XFS_BHVTOI(bdp);
135         mp = ip->i_mount;
136
137         if (XFS_FORCED_SHUTDOWN(mp))
138                 return XFS_ERROR(EIO);
139
140         if (!(flags & ATTR_LAZY))
141                 xfs_ilock(ip, XFS_ILOCK_SHARED);
142
143         vap->va_size = ip->i_d.di_size;
144         if (vap->va_mask == XFS_AT_SIZE)
145                 goto all_done;
146
147         vap->va_nblocks =
148                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
149         vap->va_nodeid = ip->i_ino;
150 #if XFS_BIG_INUMS
151         vap->va_nodeid += mp->m_inoadd;
152 #endif
153         vap->va_nlink = ip->i_d.di_nlink;
154
155         /*
156          * Quick exit for non-stat callers
157          */
158         if ((vap->va_mask &
159             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
160               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
161                 goto all_done;
162
163         /*
164          * Copy from in-core inode.
165          */
166         vap->va_type = vp->v_type;
167         vap->va_mode = ip->i_d.di_mode & MODEMASK;
168         vap->va_uid = ip->i_d.di_uid;
169         vap->va_gid = ip->i_d.di_gid;
170         vap->va_projid = ip->i_d.di_projid;
171
172         /*
173          * Check vnode type block/char vs. everything else.
174          * Do it with bitmask because that's faster than looking
175          * for multiple values individually.
176          */
177         if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) {
178                 vap->va_rdev = 0;
179
180                 if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
181
182 #if 0
183                         /* Large block sizes confuse various
184                          * user space programs, so letting the
185                          * stripe size through is not a good
186                          * idea for now.
187                          */
188                         vap->va_blocksize = mp->m_swidth ?
189                                 /*
190                                  * If the underlying volume is a stripe, then
191                                  * return the stripe width in bytes as the
192                                  * recommended I/O size.
193                                  */
194                                 (mp->m_swidth << mp->m_sb.sb_blocklog) :
195                                 /*
196                                  * Return the largest of the preferred buffer
197                                  * sizes since doing small I/Os into larger
198                                  * buffers causes buffers to be decommissioned.
199                                  * The value returned is in bytes.
200                                  */
201                                 (1 << (int)MAX(mp->m_readio_log,
202                                                mp->m_writeio_log));
203
204 #else
205                         vap->va_blocksize =
206                                 /*
207                                  * Return the largest of the preferred buffer
208                                  * sizes since doing small I/Os into larger
209                                  * buffers causes buffers to be decommissioned.
210                                  * The value returned is in bytes.
211                                  */
212                                 1 << (int)MAX(mp->m_readio_log,
213                                                mp->m_writeio_log);
214 #endif
215                 } else {
216
217                         /*
218                          * If the file blocks are being allocated from a
219                          * realtime partition, then return the inode's
220                          * realtime extent size or the realtime volume's
221                          * extent size.
222                          */
223                         vap->va_blocksize = ip->i_d.di_extsize ?
224                                 (ip->i_d.di_extsize << mp->m_sb.sb_blocklog) :
225                                 (mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
226                 }
227         } else {
228                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
229                 vap->va_blocksize = BLKDEV_IOSIZE;
230         }
231
232         vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec;
233         vap->va_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
234         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
235         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
236         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
237         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
238
239         /*
240          * Exit for stat callers.  See if any of the rest of the fields
241          * to be filled in are needed.
242          */
243         if ((vap->va_mask &
244              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
245               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
246                 goto all_done;
247
248         /*
249          * Convert di_flags to xflags.
250          */
251         vap->va_xflags = xfs_dic2xflags(&ip->i_d, ARCH_NOCONVERT);
252
253         /*
254          * Exit for inode revalidate.  See if any of the rest of
255          * the fields to be filled in are needed.
256          */
257         if ((vap->va_mask &
258              (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
259               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
260                 goto all_done;
261
262         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
263         vap->va_nextents =
264                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
265                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
266                         ip->i_d.di_nextents;
267         if (ip->i_afp)
268                 vap->va_anextents =
269                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
270                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
271                                  ip->i_d.di_anextents;
272         else
273                 vap->va_anextents = 0;
274         vap->va_gen = ip->i_d.di_gen;
275
276  all_done:
277         if (!(flags & ATTR_LAZY))
278                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
279         return 0;
280 }
281
282
283 /*
284  * xfs_setattr
285  */
286 STATIC int
287 xfs_setattr(
288         bhv_desc_t              *bdp,
289         vattr_t                 *vap,
290         int                     flags,
291         cred_t                  *credp)
292 {
293         xfs_inode_t             *ip;
294         xfs_trans_t             *tp;
295         xfs_mount_t             *mp;
296         int                     mask;
297         int                     code;
298         uint                    lock_flags;
299         uint                    commit_flags=0;
300         uid_t                   uid=0, iuid=0;
301         gid_t                   gid=0, igid=0;
302         int                     timeflags = 0;
303         vnode_t                 *vp;
304         xfs_prid_t              projid=0, iprojid=0;
305         int                     mandlock_before, mandlock_after;
306         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
307         int                     file_owner;
308
309         vp = BHV_TO_VNODE(bdp);
310         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
311
312         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
313                 return XFS_ERROR(EROFS);
314
315         /*
316          * Cannot set certain attributes.
317          */
318         mask = vap->va_mask;
319         if (mask & XFS_AT_NOSET) {
320                 return XFS_ERROR(EINVAL);
321         }
322
323         ip = XFS_BHVTOI(bdp);
324         mp = ip->i_mount;
325
326         if (XFS_FORCED_SHUTDOWN(mp))
327                 return XFS_ERROR(EIO);
328
329         /*
330          * Timestamps do not need to be logged and hence do not
331          * need to be done within a transaction.
332          */
333         if (mask & XFS_AT_UPDTIMES) {
334                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
335                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
336                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
337                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
338                 xfs_ichgtime(ip, timeflags);
339                 return 0;
340         }
341
342         olddquot1 = olddquot2 = NULL;
343         udqp = gdqp = NULL;
344
345         /*
346          * If disk quotas is on, we make sure that the dquots do exist on disk,
347          * before we start any other transactions. Trying to do this later
348          * is messy. We don't care to take a readlock to look at the ids
349          * in inode here, because we can't hold it across the trans_reserve.
350          * If the IDs do change before we take the ilock, we're covered
351          * because the i_*dquot fields will get updated anyway.
352          */
353         if (XFS_IS_QUOTA_ON(mp) && (mask & (XFS_AT_UID|XFS_AT_GID))) {
354                 uint    qflags = 0;
355
356                 if (mask & XFS_AT_UID) {
357                         uid = vap->va_uid;
358                         qflags |= XFS_QMOPT_UQUOTA;
359                 } else {
360                         uid = ip->i_d.di_uid;
361                 }
362                 if (mask & XFS_AT_GID) {
363                         gid = vap->va_gid;
364                         qflags |= XFS_QMOPT_GQUOTA;
365                 }  else {
366                         gid = ip->i_d.di_gid;
367                 }
368                 /*
369                  * We take a reference when we initialize udqp and gdqp,
370                  * so it is important that we never blindly double trip on
371                  * the same variable. See xfs_create() for an example.
372                  */
373                 ASSERT(udqp == NULL);
374                 ASSERT(gdqp == NULL);
375                 code = XFS_QM_DQVOPALLOC(mp, ip, uid,gid, qflags, &udqp, &gdqp);
376                 if (code)
377                         return (code);
378         }
379
380         /*
381          * For the other attributes, we acquire the inode lock and
382          * first do an error checking pass.
383          */
384         tp = NULL;
385         lock_flags = XFS_ILOCK_EXCL;
386         if (!(mask & XFS_AT_SIZE)) {
387                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
388                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
389                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
390                         commit_flags = 0;
391                         if ((code = xfs_trans_reserve(tp, 0,
392                                                      XFS_ICHANGE_LOG_RES(mp), 0,
393                                                      0, 0))) {
394                                 lock_flags = 0;
395                                 goto error_return;
396                         }
397                 }
398         } else {
399                 if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
400                     !(flags & ATTR_DMI)) {
401                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
402                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
403                                 vap->va_size, 0, dmflags, NULL);
404                         if (code) {
405                                 lock_flags = 0;
406                                 goto error_return;
407                         }
408                 }
409                 lock_flags |= XFS_IOLOCK_EXCL;
410         }
411
412         xfs_ilock(ip, lock_flags);
413
414         /* boolean: are we the file owner? */
415         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
416
417         /*
418          * Change various properties of a file.
419          * Only the owner or users with CAP_FOWNER
420          * capability may do these things.
421          */
422         if (mask &
423             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
424              XFS_AT_GID|XFS_AT_PROJID)) {
425                 /*
426                  * CAP_FOWNER overrides the following restrictions:
427                  *
428                  * The user ID of the calling process must be equal
429                  * to the file owner ID, except in cases where the
430                  * CAP_FSETID capability is applicable.
431                  */
432                 if (!file_owner && !capable(CAP_FOWNER)) {
433                         code = XFS_ERROR(EPERM);
434                         goto error_return;
435                 }
436
437                 /*
438                  * CAP_FSETID overrides the following restrictions:
439                  *
440                  * The effective user ID of the calling process shall match
441                  * the file owner when setting the set-user-ID and
442                  * set-group-ID bits on that file.
443                  *
444                  * The effective group ID or one of the supplementary group
445                  * IDs of the calling process shall match the group owner of
446                  * the file when setting the set-group-ID bit on that file
447                  */
448                 if (mask & XFS_AT_MODE) {
449                         mode_t m = 0;
450
451                         if ((vap->va_mode & S_ISUID) && !file_owner)
452                                 m |= S_ISUID;
453                         if ((vap->va_mode & S_ISGID) &&
454                             !in_group_p((gid_t)ip->i_d.di_gid))
455                                 m |= S_ISGID;
456 #if 0
457                         /* Linux allows this, Irix doesn't. */
458                         if ((vap->va_mode & S_ISVTX) && vp->v_type != VDIR)
459                                 m |= S_ISVTX;
460 #endif
461                         if (m && !capable(CAP_FSETID))
462                                 vap->va_mode &= ~m;
463                 }
464         }
465
466         /*
467          * Change file ownership.  Must be the owner or privileged.
468          * If the system was configured with the "restricted_chown"
469          * option, the owner is not permitted to give away the file,
470          * and can change the group id only to a group of which he
471          * or she is a member.
472          */
473         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
474                 /*
475                  * These IDs could have changed since we last looked at them.
476                  * But, we're assured that if the ownership did change
477                  * while we didn't have the inode locked, inode's dquot(s)
478                  * would have changed also.
479                  */
480                 iuid = ip->i_d.di_uid;
481                 iprojid = ip->i_d.di_projid;
482                 igid = ip->i_d.di_gid;
483                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
484                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
485                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
486                          iprojid;
487
488                 /*
489                  * CAP_CHOWN overrides the following restrictions:
490                  *
491                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
492                  * shall override the restriction that a process cannot
493                  * change the user ID of a file it owns and the restriction
494                  * that the group ID supplied to the chown() function
495                  * shall be equal to either the group ID or one of the
496                  * supplementary group IDs of the calling process.
497                  *
498                  * XXX: How does restricted_chown affect projid?
499                  */
500                 if (restricted_chown &&
501                     (iuid != uid || (igid != gid &&
502                                      !in_group_p((gid_t)gid))) &&
503                     !capable(CAP_CHOWN)) {
504                         code = XFS_ERROR(EPERM);
505                         goto error_return;
506                 }
507                 /*
508                  * Do a quota reservation only if uid or gid is actually
509                  * going to change.
510                  */
511                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
512                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
513                         ASSERT(tp);
514                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
515                                                 capable(CAP_FOWNER) ?
516                                                 XFS_QMOPT_FORCE_RES : 0);
517                         if (code)       /* out of quota */
518                                 goto error_return;
519                 }
520         }
521
522         /*
523          * Truncate file.  Must have write permission and not be a directory.
524          */
525         if (mask & XFS_AT_SIZE) {
526                 /* Short circuit the truncate case for zero length files */
527                 if ((vap->va_size == 0) &&
528                    (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) {
529                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
530                         lock_flags &= ~XFS_ILOCK_EXCL;
531                         if (mask & XFS_AT_CTIME)
532                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
533                         code = 0;
534                         goto error_return;
535                 }
536
537                 if (vp->v_type == VDIR) {
538                         code = XFS_ERROR(EISDIR);
539                         goto error_return;
540                 } else if (vp->v_type != VREG) {
541                         code = XFS_ERROR(EINVAL);
542                         goto error_return;
543                 }
544                 /*
545                  * Make sure that the dquots are attached to the inode.
546                  */
547                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
548                         goto error_return;
549         }
550
551         /*
552          * Change file access or modified times.
553          */
554         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
555                 if (!file_owner) {
556                         if ((flags & ATTR_UTIME) &&
557                             !capable(CAP_FOWNER)) {
558                                 code = XFS_ERROR(EPERM);
559                                 goto error_return;
560                         }
561                 }
562         }
563
564         /*
565          * Change extent size or realtime flag.
566          */
567         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
568                 /*
569                  * Can't change extent size if any extents are allocated.
570                  */
571                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
572                     (mask & XFS_AT_EXTSIZE) &&
573                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
574                      vap->va_extsize) ) {
575                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
576                         goto error_return;
577                 }
578
579                 /*
580                  * Can't set extent size unless the file is marked, or
581                  * about to be marked as a realtime file.
582                  *
583                  * This check will be removed when fixed size extents
584                  * with buffered data writes is implemented.
585                  *
586                  */
587                 if ((mask & XFS_AT_EXTSIZE)                     &&
588                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
589                      vap->va_extsize) &&
590                     (!((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
591                        ((mask & XFS_AT_XFLAGS) &&
592                         (vap->va_xflags & XFS_XFLAG_REALTIME))))) {
593                         code = XFS_ERROR(EINVAL);
594                         goto error_return;
595                 }
596
597                 /*
598                  * Can't change realtime flag if any extents are allocated.
599                  */
600                 if (ip->i_d.di_nextents && (mask & XFS_AT_XFLAGS) &&
601                     (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
602                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
603                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
604                         goto error_return;
605                 }
606                 /*
607                  * Extent size must be a multiple of the appropriate block
608                  * size, if set at all.
609                  */
610                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
611                         xfs_extlen_t    size;
612
613                         if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
614                             ((mask & XFS_AT_XFLAGS) &&
615                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
616                                 size = mp->m_sb.sb_rextsize <<
617                                        mp->m_sb.sb_blocklog;
618                         } else {
619                                 size = mp->m_sb.sb_blocksize;
620                         }
621                         if (vap->va_extsize % size) {
622                                 code = XFS_ERROR(EINVAL);
623                                 goto error_return;
624                         }
625                 }
626                 /*
627                  * If realtime flag is set then must have realtime data.
628                  */
629                 if ((mask & XFS_AT_XFLAGS) &&
630                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
631                         if ((mp->m_sb.sb_rblocks == 0) ||
632                             (mp->m_sb.sb_rextsize == 0) ||
633                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
634                                 code = XFS_ERROR(EINVAL);
635                                 goto error_return;
636                         }
637                 }
638
639                 /*
640                  * Can't modify an immutable/append-only file unless
641                  * we have appropriate permission.
642                  */
643                 if ((mask & XFS_AT_XFLAGS) &&
644                     (ip->i_d.di_flags &
645                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
646                      (vap->va_xflags &
647                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
648                     !capable(CAP_LINUX_IMMUTABLE)) {
649                         code = XFS_ERROR(EPERM);
650                         goto error_return;
651                 }
652         }
653
654         /*
655          * Now we can make the changes.  Before we join the inode
656          * to the transaction, if XFS_AT_SIZE is set then take care of
657          * the part of the truncation that must be done without the
658          * inode lock.  This needs to be done before joining the inode
659          * to the transaction, because the inode cannot be unlocked
660          * once it is a part of the transaction.
661          */
662         if (mask & XFS_AT_SIZE) {
663                 code = 0;
664                 if (vap->va_size > ip->i_d.di_size)
665                         code = xfs_igrow_start(ip, vap->va_size, credp);
666                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
667                 if (!code)
668                         code = xfs_itruncate_data(ip, vap->va_size);
669                 if (code) {
670                         ASSERT(tp == NULL);
671                         lock_flags &= ~XFS_ILOCK_EXCL;
672                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
673                         goto error_return;
674                 }
675                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
676                 if ((code = xfs_trans_reserve(tp, 0,
677                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
678                                              XFS_TRANS_PERM_LOG_RES,
679                                              XFS_ITRUNCATE_LOG_COUNT))) {
680                         xfs_trans_cancel(tp, 0);
681                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
682                         return code;
683                 }
684                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
685                 xfs_ilock(ip, XFS_ILOCK_EXCL);
686         }
687
688         if (tp) {
689                 xfs_trans_ijoin(tp, ip, lock_flags);
690                 xfs_trans_ihold(tp, ip);
691         }
692
693         /* determine whether mandatory locking mode changes */
694         mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
695
696         /*
697          * Truncate file.  Must have write permission and not be a directory.
698          */
699         if (mask & XFS_AT_SIZE) {
700                 if (vap->va_size > ip->i_d.di_size) {
701                         xfs_igrow_finish(tp, ip, vap->va_size,
702                             !(flags & ATTR_DMI));
703                 } else if ((vap->va_size <= ip->i_d.di_size) ||
704                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
705                         /*
706                          * signal a sync transaction unless
707                          * we're truncating an already unlinked
708                          * file on a wsync filesystem
709                          */
710                         code = xfs_itruncate_finish(&tp, ip,
711                                             (xfs_fsize_t)vap->va_size,
712                                             XFS_DATA_FORK,
713                                             ((ip->i_d.di_nlink != 0 ||
714                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
715                                              ? 1 : 0));
716                         if (code) {
717                                 goto abort_return;
718                         }
719                 }
720                 /*
721                  * Have to do this even if the file's size doesn't change.
722                  */
723                 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
724         }
725
726         /*
727          * Change file access modes.
728          */
729         if (mask & XFS_AT_MODE) {
730                 ip->i_d.di_mode &= S_IFMT;
731                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
732
733                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
734                 timeflags |= XFS_ICHGTIME_CHG;
735         }
736
737         /*
738          * Change file ownership.  Must be the owner or privileged.
739          * If the system was configured with the "restricted_chown"
740          * option, the owner is not permitted to give away the file,
741          * and can change the group id only to a group of which he
742          * or she is a member.
743          */
744         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
745                 /*
746                  * CAP_FSETID overrides the following restrictions:
747                  *
748                  * The set-user-ID and set-group-ID bits of a file will be
749                  * cleared upon successful return from chown()
750                  */
751                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
752                     !capable(CAP_FSETID)) {
753                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
754                 }
755
756                 /*
757                  * Change the ownerships and register quota modifications
758                  * in the transaction.
759                  */
760                 if (iuid != uid) {
761                         if (XFS_IS_UQUOTA_ON(mp)) {
762                                 ASSERT(mask & XFS_AT_UID);
763                                 ASSERT(udqp);
764                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
765                                                         &ip->i_udquot, udqp);
766                         }
767                         ip->i_d.di_uid = uid;
768                 }
769                 if (igid != gid) {
770                         if (XFS_IS_GQUOTA_ON(mp)) {
771                                 ASSERT(mask & XFS_AT_GID);
772                                 ASSERT(gdqp);
773                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
774                                                         &ip->i_gdquot, gdqp);
775                         }
776                         ip->i_d.di_gid = gid;
777                 }
778                 if (iprojid != projid) {
779                         ip->i_d.di_projid = projid;
780                         /*
781                          * We may have to rev the inode as well as
782                          * the superblock version number since projids didn't
783                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
784                          */
785                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
786                                 xfs_bump_ino_vers2(tp, ip);
787                 }
788
789                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
790                 timeflags |= XFS_ICHGTIME_CHG;
791         }
792
793
794         /*
795          * Change file access or modified times.
796          */
797         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
798                 if (mask & XFS_AT_ATIME) {
799                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
800                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
801                         ip->i_update_core = 1;
802                         timeflags &= ~XFS_ICHGTIME_ACC;
803                 }
804                 if (mask & XFS_AT_MTIME) {
805                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
806                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
807                         timeflags &= ~XFS_ICHGTIME_MOD;
808                         timeflags |= XFS_ICHGTIME_CHG;
809                 }
810                 if (tp && (flags & ATTR_UTIME))
811                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
812         }
813
814         /*
815          * Change XFS-added attributes.
816          */
817         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
818                 if (mask & XFS_AT_EXTSIZE) {
819                         /*
820                          * Converting bytes to fs blocks.
821                          */
822                         ip->i_d.di_extsize = vap->va_extsize >>
823                                 mp->m_sb.sb_blocklog;
824                 }
825                 if (mask & XFS_AT_XFLAGS) {
826                         /* can't set PREALLOC this way, just preserve it */
827                         ip->i_d.di_flags =
828                                 (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
829                         if (vap->va_xflags & XFS_XFLAG_REALTIME &&
830                             (ip->i_d.di_mode & S_IFMT) == S_IFREG) {
831                                 ip->i_d.di_flags |= XFS_DIFLAG_REALTIME;
832                                 ip->i_iocore.io_flags |= XFS_IOCORE_RT;
833                         } else {
834                                 ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
835                         }
836                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
837                                 ip->i_d.di_flags |= XFS_DIFLAG_IMMUTABLE;
838                         if (vap->va_xflags & XFS_XFLAG_APPEND)
839                                 ip->i_d.di_flags |= XFS_DIFLAG_APPEND;
840                         if (vap->va_xflags & XFS_XFLAG_SYNC)
841                                 ip->i_d.di_flags |= XFS_DIFLAG_SYNC;
842                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
843                                 ip->i_d.di_flags |= XFS_DIFLAG_NOATIME;
844                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
845                                 ip->i_d.di_flags |= XFS_DIFLAG_NODUMP;
846                         if ((vap->va_xflags & XFS_XFLAG_RTINHERIT) &&
847                             (ip->i_d.di_mode & S_IFMT) == S_IFDIR)
848                                 ip->i_d.di_flags |= XFS_DIFLAG_RTINHERIT;
849                 }
850                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
851                 timeflags |= XFS_ICHGTIME_CHG;
852         }
853
854         /*
855          * Change file inode change time only if XFS_AT_CTIME set
856          * AND we have been called by a DMI function.
857          */
858
859         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
860                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
861                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
862                 ip->i_update_core = 1;
863                 timeflags &= ~XFS_ICHGTIME_CHG;
864         }
865
866         /*
867          * Send out timestamp changes that need to be set to the
868          * current time.  Not done when called by a DMI function.
869          */
870         if (timeflags && !(flags & ATTR_DMI))
871                 xfs_ichgtime(ip, timeflags);
872
873         XFS_STATS_INC(xs_ig_attrchg);
874
875         /*
876          * If this is a synchronous mount, make sure that the
877          * transaction goes to disk before returning to the user.
878          * This is slightly sub-optimal in that truncates require
879          * two sync transactions instead of one for wsync filesystems.
880          * One for the truncate and one for the timestamps since we
881          * don't want to change the timestamps unless we're sure the
882          * truncate worked.  Truncates are less than 1% of the laddis
883          * mix so this probably isn't worth the trouble to optimize.
884          */
885         code = 0;
886         if (tp) {
887                 if (mp->m_flags & XFS_MOUNT_WSYNC)
888                         xfs_trans_set_sync(tp);
889
890                 code = xfs_trans_commit(tp, commit_flags, NULL);
891         }
892
893         /*
894          * If the (regular) file's mandatory locking mode changed, then
895          * notify the vnode.  We do this under the inode lock to prevent
896          * racing calls to vop_vnode_change.
897          */
898         mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
899         if (mandlock_before != mandlock_after) {
900                 VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_ENF_LOCKING,
901                                  mandlock_after);
902         }
903
904         xfs_iunlock(ip, lock_flags);
905
906         /*
907          * Release any dquot(s) the inode had kept before chown.
908          */
909         XFS_QM_DQRELE(mp, olddquot1);
910         XFS_QM_DQRELE(mp, olddquot2);
911         XFS_QM_DQRELE(mp, udqp);
912         XFS_QM_DQRELE(mp, gdqp);
913
914         if (code) {
915                 return code;
916         }
917
918         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
919             !(flags & ATTR_DMI)) {
920                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
921                                         NULL, DM_RIGHT_NULL, NULL, NULL,
922                                         0, 0, AT_DELAY_FLAG(flags));
923         }
924         return 0;
925
926  abort_return:
927         commit_flags |= XFS_TRANS_ABORT;
928         /* FALLTHROUGH */
929  error_return:
930         XFS_QM_DQRELE(mp, udqp);
931         XFS_QM_DQRELE(mp, gdqp);
932         if (tp) {
933                 xfs_trans_cancel(tp, commit_flags);
934         }
935         if (lock_flags != 0) {
936                 xfs_iunlock(ip, lock_flags);
937         }
938         return code;
939 }
940
941
942 /*
943  * xfs_access
944  * Null conversion from vnode mode bits to inode mode bits, as in efs.
945  */
946 STATIC int
947 xfs_access(
948         bhv_desc_t      *bdp,
949         int             mode,
950         cred_t          *credp)
951 {
952         xfs_inode_t     *ip;
953         int             error;
954
955         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
956                                                (inst_t *)__return_address);
957
958         ip = XFS_BHVTOI(bdp);
959         xfs_ilock(ip, XFS_ILOCK_SHARED);
960         error = xfs_iaccess(ip, mode, credp);
961         xfs_iunlock(ip, XFS_ILOCK_SHARED);
962         return error;
963 }
964
965
966 /*
967  * xfs_readlink
968  *
969  */
970 STATIC int
971 xfs_readlink(
972         bhv_desc_t      *bdp,
973         uio_t           *uiop,
974         int             ioflags,
975         cred_t          *credp)
976 {
977         xfs_inode_t     *ip;
978         int             count;
979         xfs_off_t       offset;
980         int             pathlen;
981         vnode_t         *vp;
982         int             error = 0;
983         xfs_mount_t     *mp;
984         int             nmaps;
985         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
986         xfs_daddr_t     d;
987         int             byte_cnt;
988         int             n;
989         xfs_buf_t       *bp;
990
991         vp = BHV_TO_VNODE(bdp);
992         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
993
994         ip = XFS_BHVTOI(bdp);
995         mp = ip->i_mount;
996
997         if (XFS_FORCED_SHUTDOWN(mp))
998                 return XFS_ERROR(EIO);
999
1000         xfs_ilock(ip, XFS_ILOCK_SHARED);
1001
1002         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
1003
1004         offset = uiop->uio_offset;
1005         count = uiop->uio_resid;
1006
1007         if (offset < 0) {
1008                 error = XFS_ERROR(EINVAL);
1009                 goto error_return;
1010         }
1011         if (count <= 0) {
1012                 error = 0;
1013                 goto error_return;
1014         }
1015
1016         if (!(ioflags & IO_INVIS)) {
1017                 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
1018         }
1019
1020         /*
1021          * See if the symlink is stored inline.
1022          */
1023         pathlen = (int)ip->i_d.di_size;
1024
1025         if (ip->i_df.if_flags & XFS_IFINLINE) {
1026                 error = uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
1027         }
1028         else {
1029                 /*
1030                  * Symlink not inline.  Call bmap to get it in.
1031                  */
1032                 nmaps = SYMLINK_MAPS;
1033
1034                 error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
1035                                   0, NULL, 0, mval, &nmaps, NULL);
1036
1037                 if (error) {
1038                         goto error_return;
1039                 }
1040
1041                 for (n = 0; n < nmaps; n++) {
1042                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1043                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1044                         bp = xfs_buf_read(mp->m_ddev_targp, d,
1045                                       BTOBB(byte_cnt), 0);
1046                         error = XFS_BUF_GETERROR(bp);
1047                         if (error) {
1048                                 xfs_ioerror_alert("xfs_readlink",
1049                                           ip->i_mount, bp, XFS_BUF_ADDR(bp));
1050                                 xfs_buf_relse(bp);
1051                                 goto error_return;
1052                         }
1053                         if (pathlen < byte_cnt)
1054                                 byte_cnt = pathlen;
1055                         pathlen -= byte_cnt;
1056
1057                         error = uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
1058                         xfs_buf_relse (bp);
1059                 }
1060
1061         }
1062
1063
1064 error_return:
1065
1066         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1067
1068         return error;
1069 }
1070
1071
1072 /*
1073  * xfs_fsync
1074  *
1075  * This is called to sync the inode and its data out to disk.
1076  * We need to hold the I/O lock while flushing the data, and
1077  * the inode lock while flushing the inode.  The inode lock CANNOT
1078  * be held while flushing the data, so acquire after we're done
1079  * with that.
1080  */
1081 STATIC int
1082 xfs_fsync(
1083         bhv_desc_t      *bdp,
1084         int             flag,
1085         cred_t          *credp,
1086         xfs_off_t       start,
1087         xfs_off_t       stop)
1088 {
1089         xfs_inode_t     *ip;
1090         xfs_trans_t     *tp;
1091         int             error;
1092
1093         vn_trace_entry(BHV_TO_VNODE(bdp),
1094                         __FUNCTION__, (inst_t *)__return_address);
1095
1096         ip = XFS_BHVTOI(bdp);
1097
1098         ASSERT(start >= 0 && stop >= -1);
1099
1100         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1101                 return XFS_ERROR(EIO);
1102
1103         /*
1104          * We always need to make sure that the required inode state
1105          * is safe on disk.  The vnode might be clean but because
1106          * of committed transactions that haven't hit the disk yet.
1107          * Likewise, there could be unflushed non-transactional
1108          * changes to the inode core that have to go to disk.
1109          *
1110          * The following code depends on one assumption:  that
1111          * any transaction that changes an inode logs the core
1112          * because it has to change some field in the inode core
1113          * (typically nextents or nblocks).  That assumption
1114          * implies that any transactions against an inode will
1115          * catch any non-transactional updates.  If inode-altering
1116          * transactions exist that violate this assumption, the
1117          * code breaks.  Right now, it figures that if the involved
1118          * update_* field is clear and the inode is unpinned, the
1119          * inode is clean.  Either it's been flushed or it's been
1120          * committed and the commit has hit the disk unpinning the inode.
1121          * (Note that xfs_inode_item_format() called at commit clears
1122          * the update_* fields.)
1123          */
1124         xfs_ilock(ip, XFS_ILOCK_SHARED);
1125
1126         /* If we are flushing data then we care about update_size
1127          * being set, otherwise we care about update_core
1128          */
1129         if ((flag & FSYNC_DATA) ?
1130                         (ip->i_update_size == 0) :
1131                         (ip->i_update_core == 0)) {
1132                 /*
1133                  * Timestamps/size haven't changed since last inode
1134                  * flush or inode transaction commit.  That means
1135                  * either nothing got written or a transaction
1136                  * committed which caught the updates.  If the
1137                  * latter happened and the transaction hasn't
1138                  * hit the disk yet, the inode will be still
1139                  * be pinned.  If it is, force the log.
1140                  */
1141
1142                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1143
1144                 if (xfs_ipincount(ip)) {
1145                         xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1146                                       XFS_LOG_FORCE |
1147                                       ((flag & FSYNC_WAIT)
1148                                        ? XFS_LOG_SYNC : 0));
1149                 }
1150                 error = 0;
1151         } else  {
1152                 /*
1153                  * Kick off a transaction to log the inode
1154                  * core to get the updates.  Make it
1155                  * sync if FSYNC_WAIT is passed in (which
1156                  * is done by everybody but specfs).  The
1157                  * sync transaction will also force the log.
1158                  */
1159                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1160                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1161                 if ((error = xfs_trans_reserve(tp, 0,
1162                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1163                                 0, 0, 0)))  {
1164                         xfs_trans_cancel(tp, 0);
1165                         return error;
1166                 }
1167                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1168
1169                 /*
1170                  * Note - it's possible that we might have pushed
1171                  * ourselves out of the way during trans_reserve
1172                  * which would flush the inode.  But there's no
1173                  * guarantee that the inode buffer has actually
1174                  * gone out yet (it's delwri).  Plus the buffer
1175                  * could be pinned anyway if it's part of an
1176                  * inode in another recent transaction.  So we
1177                  * play it safe and fire off the transaction anyway.
1178                  */
1179                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1180                 xfs_trans_ihold(tp, ip);
1181                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1182                 if (flag & FSYNC_WAIT)
1183                         xfs_trans_set_sync(tp);
1184                 error = xfs_trans_commit(tp, 0, NULL);
1185
1186                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1187         }
1188         return error;
1189 }
1190
1191 /*
1192  * This is called by xfs_inactive to free any blocks beyond eof,
1193  * when the link count isn't zero.
1194  */
1195 STATIC int
1196 xfs_inactive_free_eofblocks(
1197         xfs_mount_t     *mp,
1198         xfs_inode_t     *ip)
1199 {
1200         xfs_trans_t     *tp;
1201         int             error;
1202         xfs_fileoff_t   end_fsb;
1203         xfs_fileoff_t   last_fsb;
1204         xfs_filblks_t   map_len;
1205         int             nimaps;
1206         xfs_bmbt_irec_t imap;
1207
1208         /*
1209          * Figure out if there are any blocks beyond the end
1210          * of the file.  If not, then there is nothing to do.
1211          */
1212         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size));
1213         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1214         map_len = last_fsb - end_fsb;
1215         if (map_len <= 0)
1216                 return (0);
1217
1218         nimaps = 1;
1219         xfs_ilock(ip, XFS_ILOCK_SHARED);
1220         error = xfs_bmapi(NULL, ip, end_fsb, map_len, 0,
1221                           NULL, 0, &imap, &nimaps, NULL);
1222         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1223
1224         if (!error && (nimaps != 0) &&
1225             (imap.br_startblock != HOLESTARTBLOCK)) {
1226                 /*
1227                  * Attach the dquots to the inode up front.
1228                  */
1229                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1230                         return (error);
1231
1232                 /*
1233                  * There are blocks after the end of file.
1234                  * Free them up now by truncating the file to
1235                  * its current size.
1236                  */
1237                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1238
1239                 /*
1240                  * Do the xfs_itruncate_start() call before
1241                  * reserving any log space because
1242                  * itruncate_start will call into the buffer
1243                  * cache and we can't
1244                  * do that within a transaction.
1245                  */
1246                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1247                 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1248                                     ip->i_d.di_size);
1249
1250                 error = xfs_trans_reserve(tp, 0,
1251                                           XFS_ITRUNCATE_LOG_RES(mp),
1252                                           0, XFS_TRANS_PERM_LOG_RES,
1253                                           XFS_ITRUNCATE_LOG_COUNT);
1254                 if (error) {
1255                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1256                         xfs_trans_cancel(tp, 0);
1257                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1258                         return (error);
1259                 }
1260
1261                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1262                 xfs_trans_ijoin(tp, ip,
1263                                 XFS_IOLOCK_EXCL |
1264                                 XFS_ILOCK_EXCL);
1265                 xfs_trans_ihold(tp, ip);
1266
1267                 error = xfs_itruncate_finish(&tp, ip,
1268                                              ip->i_d.di_size,
1269                                              XFS_DATA_FORK,
1270                                              0);
1271                 /*
1272                  * If we get an error at this point we
1273                  * simply don't bother truncating the file.
1274                  */
1275                 if (error) {
1276                         xfs_trans_cancel(tp,
1277                                          (XFS_TRANS_RELEASE_LOG_RES |
1278                                           XFS_TRANS_ABORT));
1279                 } else {
1280                         error = xfs_trans_commit(tp,
1281                                                 XFS_TRANS_RELEASE_LOG_RES,
1282                                                 NULL);
1283                 }
1284                 xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1285         }
1286         return (error);
1287 }
1288
1289 /*
1290  * Free a symlink that has blocks associated with it.
1291  */
1292 STATIC int
1293 xfs_inactive_symlink_rmt(
1294         xfs_inode_t     *ip,
1295         xfs_trans_t     **tpp)
1296 {
1297         xfs_buf_t       *bp;
1298         int             committed;
1299         int             done;
1300         int             error;
1301         xfs_fsblock_t   first_block;
1302         xfs_bmap_free_t free_list;
1303         int             i;
1304         xfs_mount_t     *mp;
1305         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
1306         int             nmaps;
1307         xfs_trans_t     *ntp;
1308         int             size;
1309         xfs_trans_t     *tp;
1310
1311         tp = *tpp;
1312         mp = ip->i_mount;
1313         ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
1314         /*
1315          * We're freeing a symlink that has some
1316          * blocks allocated to it.  Free the
1317          * blocks here.  We know that we've got
1318          * either 1 or 2 extents and that we can
1319          * free them all in one bunmapi call.
1320          */
1321         ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
1322         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1323                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1324                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1325                 xfs_trans_cancel(tp, 0);
1326                 *tpp = NULL;
1327                 return error;
1328         }
1329         /*
1330          * Lock the inode, fix the size, and join it to the transaction.
1331          * Hold it so in the normal path, we still have it locked for
1332          * the second transaction.  In the error paths we need it
1333          * held so the cancel won't rele it, see below.
1334          */
1335         xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1336         size = (int)ip->i_d.di_size;
1337         ip->i_d.di_size = 0;
1338         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1339         xfs_trans_ihold(tp, ip);
1340         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1341         /*
1342          * Find the block(s) so we can inval and unmap them.
1343          */
1344         done = 0;
1345         XFS_BMAP_INIT(&free_list, &first_block);
1346         nmaps = sizeof(mval) / sizeof(mval[0]);
1347         if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
1348                         XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
1349                         &free_list)))
1350                 goto error0;
1351         /*
1352          * Invalidate the block(s).
1353          */
1354         for (i = 0; i < nmaps; i++) {
1355                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
1356                         XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
1357                         XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
1358                 xfs_trans_binval(tp, bp);
1359         }
1360         /*
1361          * Unmap the dead block(s) to the free_list.
1362          */
1363         if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
1364                         &first_block, &free_list, &done)))
1365                 goto error1;
1366         ASSERT(done);
1367         /*
1368          * Commit the first transaction.  This logs the EFI and the inode.
1369          */
1370         if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed)))
1371                 goto error1;
1372         /*
1373          * The transaction must have been committed, since there were
1374          * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
1375          * The new tp has the extent freeing and EFDs.
1376          */
1377         ASSERT(committed);
1378         /*
1379          * The first xact was committed, so add the inode to the new one.
1380          * Mark it dirty so it will be logged and moved forward in the log as
1381          * part of every commit.
1382          */
1383         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1384         xfs_trans_ihold(tp, ip);
1385         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1386         /*
1387          * Get a new, empty transaction to return to our caller.
1388          */
1389         ntp = xfs_trans_dup(tp);
1390         /*
1391          * Commit the transaction containing extent freeing and EFD's.
1392          * If we get an error on the commit here or on the reserve below,
1393          * we need to unlock the inode since the new transaction doesn't
1394          * have the inode attached.
1395          */
1396         error = xfs_trans_commit(tp, 0, NULL);
1397         tp = ntp;
1398         if (error) {
1399                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1400                 goto error0;
1401         }
1402         /*
1403          * Remove the memory for extent descriptions (just bookkeeping).
1404          */
1405         if (ip->i_df.if_bytes)
1406                 xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
1407         ASSERT(ip->i_df.if_bytes == 0);
1408         /*
1409          * Put an itruncate log reservation in the new transaction
1410          * for our caller.
1411          */
1412         if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1413                         XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
1414                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1415                 goto error0;
1416         }
1417         /*
1418          * Return with the inode locked but not joined to the transaction.
1419          */
1420         *tpp = tp;
1421         return 0;
1422
1423  error1:
1424         xfs_bmap_cancel(&free_list);
1425  error0:
1426         /*
1427          * Have to come here with the inode locked and either
1428          * (held and in the transaction) or (not in the transaction).
1429          * If the inode isn't held then cancel would iput it, but
1430          * that's wrong since this is inactive and the vnode ref
1431          * count is 0 already.
1432          * Cancel won't do anything to the inode if held, but it still
1433          * needs to be locked until the cancel is done, if it was
1434          * joined to the transaction.
1435          */
1436         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1437         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1438         *tpp = NULL;
1439         return error;
1440
1441 }
1442
1443 STATIC int
1444 xfs_inactive_symlink_local(
1445         xfs_inode_t     *ip,
1446         xfs_trans_t     **tpp)
1447 {
1448         int             error;
1449
1450         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1451         /*
1452          * We're freeing a symlink which fit into
1453          * the inode.  Just free the memory used
1454          * to hold the old symlink.
1455          */
1456         error = xfs_trans_reserve(*tpp, 0,
1457                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1458                                   0, XFS_TRANS_PERM_LOG_RES,
1459                                   XFS_ITRUNCATE_LOG_COUNT);
1460
1461         if (error) {
1462                 xfs_trans_cancel(*tpp, 0);
1463                 *tpp = NULL;
1464                 return (error);
1465         }
1466         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1467
1468         /*
1469          * Zero length symlinks _can_ exist.
1470          */
1471         if (ip->i_df.if_bytes > 0) {
1472                 xfs_idata_realloc(ip,
1473                                   -(ip->i_df.if_bytes),
1474                                   XFS_DATA_FORK);
1475                 ASSERT(ip->i_df.if_bytes == 0);
1476         }
1477         return (0);
1478 }
1479
1480 /*
1481  *
1482  */
1483 STATIC int
1484 xfs_inactive_attrs(
1485         xfs_inode_t     *ip,
1486         xfs_trans_t     **tpp)
1487 {
1488         xfs_trans_t     *tp;
1489         int             error;
1490         xfs_mount_t     *mp;
1491
1492         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1493         tp = *tpp;
1494         mp = ip->i_mount;
1495         ASSERT(ip->i_d.di_forkoff != 0);
1496         xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
1497         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1498
1499         error = xfs_attr_inactive(ip);
1500         if (error) {
1501                 *tpp = NULL;
1502                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1503                 return (error); /* goto out*/
1504         }
1505
1506         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1507         error = xfs_trans_reserve(tp, 0,
1508                                   XFS_IFREE_LOG_RES(mp),
1509                                   0, XFS_TRANS_PERM_LOG_RES,
1510                                   XFS_INACTIVE_LOG_COUNT);
1511         if (error) {
1512                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1513                 xfs_trans_cancel(tp, 0);
1514                 *tpp = NULL;
1515                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1516                 return (error);
1517         }
1518
1519         xfs_ilock(ip, XFS_ILOCK_EXCL);
1520         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1521         xfs_trans_ihold(tp, ip);
1522         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1523
1524         ASSERT(ip->i_d.di_anextents == 0);
1525
1526         *tpp = tp;
1527         return (0);
1528 }
1529
1530 STATIC int
1531 xfs_release(
1532         bhv_desc_t      *bdp)
1533 {
1534         xfs_inode_t     *ip;
1535         vnode_t         *vp;
1536         xfs_mount_t     *mp;
1537         int             error;
1538
1539         vp = BHV_TO_VNODE(bdp);
1540         ip = XFS_BHVTOI(bdp);
1541
1542         if ((vp->v_type != VREG) || (ip->i_d.di_mode == 0)) {
1543                 return 0;
1544         }
1545
1546         /* If this is a read-only mount, don't do this (would generate I/O) */
1547         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1548                 return 0;
1549
1550 #ifdef HAVE_REFCACHE
1551         /* If we are in the NFS reference cache then don't do this now */
1552         if (ip->i_refcache)
1553                 return 0;
1554 #endif
1555
1556         mp = ip->i_mount;
1557
1558         if (ip->i_d.di_nlink != 0) {
1559                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1560                      ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
1561                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1562                     (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)))) {
1563                         if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1564                                 return (error);
1565                         /* Update linux inode block count after free above */
1566                         LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1567                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1568                 }
1569         }
1570
1571         return 0;
1572 }
1573
1574 /*
1575  * xfs_inactive
1576  *
1577  * This is called when the vnode reference count for the vnode
1578  * goes to zero.  If the file has been unlinked, then it must
1579  * now be truncated.  Also, we clear all of the read-ahead state
1580  * kept for the inode here since the file is now closed.
1581  */
1582 STATIC int
1583 xfs_inactive(
1584         bhv_desc_t      *bdp,
1585         cred_t          *credp)
1586 {
1587         xfs_inode_t     *ip;
1588         vnode_t         *vp;
1589         xfs_bmap_free_t free_list; 
1590         xfs_fsblock_t   first_block;
1591         int             committed;
1592         xfs_trans_t     *tp;
1593         xfs_mount_t     *mp;
1594         int             error;
1595         int             truncate;
1596
1597         vp = BHV_TO_VNODE(bdp);
1598         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1599
1600         ip = XFS_BHVTOI(bdp);
1601
1602         /*
1603          * If the inode is already free, then there can be nothing
1604          * to clean up here.
1605          */
1606         if (ip->i_d.di_mode == 0) {
1607                 ASSERT(ip->i_df.if_real_bytes == 0);
1608                 ASSERT(ip->i_df.if_broot_bytes == 0);
1609                 return VN_INACTIVE_CACHE;
1610         }
1611
1612         /*
1613          * Only do a truncate if it's a regular file with
1614          * some actual space in it.  It's OK to look at the
1615          * inode's fields without the lock because we're the
1616          * only one with a reference to the inode.
1617          */
1618         truncate = ((ip->i_d.di_nlink == 0) &&
1619             ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0)) &&
1620             ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1621
1622         mp = ip->i_mount;
1623
1624         if (ip->i_d.di_nlink == 0 &&
1625             DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
1626                 (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
1627         }
1628
1629         error = 0;
1630
1631         /* If this is a read-only mount, don't do this (would generate I/O) */
1632         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1633                 goto out;
1634
1635         if (ip->i_d.di_nlink != 0) {
1636                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1637                      ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) &&
1638                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1639                     (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)) ||
1640                      (ip->i_delayed_blks != 0))) {
1641                         if ((error = xfs_inactive_free_eofblocks(mp, ip)))
1642                                 return (VN_INACTIVE_CACHE);
1643                         /* Update linux inode block count after free above */
1644                         LINVFS_GET_IP(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1645                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1646                 }
1647                 goto out;
1648         }
1649
1650         ASSERT(ip->i_d.di_nlink == 0);
1651
1652         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1653                 return (VN_INACTIVE_CACHE);
1654
1655         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1656         if (truncate) {
1657                 /*
1658                  * Do the xfs_itruncate_start() call before
1659                  * reserving any log space because itruncate_start
1660                  * will call into the buffer cache and we can't
1661                  * do that within a transaction.
1662                  */
1663                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1664
1665                 xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1666
1667                 error = xfs_trans_reserve(tp, 0,
1668                                           XFS_ITRUNCATE_LOG_RES(mp),
1669                                           0, XFS_TRANS_PERM_LOG_RES,
1670                                           XFS_ITRUNCATE_LOG_COUNT);
1671                 if (error) {
1672                         /* Don't call itruncate_cleanup */
1673                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1674                         xfs_trans_cancel(tp, 0);
1675                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1676                         return (VN_INACTIVE_CACHE);
1677                 }
1678
1679                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1680                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1681                 xfs_trans_ihold(tp, ip);
1682
1683                 /*
1684                  * normally, we have to run xfs_itruncate_finish sync.
1685                  * But if filesystem is wsync and we're in the inactive
1686                  * path, then we know that nlink == 0, and that the
1687                  * xaction that made nlink == 0 is permanently committed
1688                  * since xfs_remove runs as a synchronous transaction.
1689                  */
1690                 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1691                                 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1692
1693                 if (error) {
1694                         xfs_trans_cancel(tp,
1695                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1696                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1697                         return (VN_INACTIVE_CACHE);
1698                 }
1699         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1700
1701                 /*
1702                  * If we get an error while cleaning up a
1703                  * symlink we bail out.
1704                  */
1705                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1706                         xfs_inactive_symlink_rmt(ip, &tp) :
1707                         xfs_inactive_symlink_local(ip, &tp);
1708
1709                 if (error) {
1710                         ASSERT(tp == NULL);
1711                         return (VN_INACTIVE_CACHE);
1712                 }
1713
1714                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1715                 xfs_trans_ihold(tp, ip);
1716         } else {
1717                 error = xfs_trans_reserve(tp, 0,
1718                                           XFS_IFREE_LOG_RES(mp),
1719                                           0, XFS_TRANS_PERM_LOG_RES,
1720                                           XFS_INACTIVE_LOG_COUNT);
1721                 if (error) {
1722                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1723                         xfs_trans_cancel(tp, 0);
1724                         return (VN_INACTIVE_CACHE);
1725                 }
1726
1727                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1728                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1729                 xfs_trans_ihold(tp, ip);
1730         }
1731
1732         /*
1733          * If there are attributes associated with the file
1734          * then blow them away now.  The code calls a routine
1735          * that recursively deconstructs the attribute fork.
1736          * We need to just commit the current transaction
1737          * because we can't use it for xfs_attr_inactive().
1738          */
1739         if (ip->i_d.di_anextents > 0) {
1740                 error = xfs_inactive_attrs(ip, &tp);
1741                 /*
1742                  * If we got an error, the transaction is already
1743                  * cancelled, and the inode is unlocked. Just get out.
1744                  */
1745                  if (error)
1746                          return (VN_INACTIVE_CACHE);
1747         } else if (ip->i_afp) {
1748                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1749         }
1750
1751         /*
1752          * Free the inode.
1753          */
1754         XFS_BMAP_INIT(&free_list, &first_block);
1755         error = xfs_ifree(tp, ip, &free_list);
1756         if (error) {
1757                 /*
1758                  * If we fail to free the inode, shut down.  The cancel
1759                  * might do that, we need to make sure.  Otherwise the
1760                  * inode might be lost for a long time or forever.
1761                  */
1762                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1763                         cmn_err(CE_NOTE,
1764                 "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1765                                 error, mp->m_fsname);
1766                         xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR);
1767                 }
1768                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1769         } else {
1770                 /*
1771                  * Credit the quota account(s). The inode is gone.
1772                  */
1773                 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1774
1775                 /*
1776                  * Just ignore errors at this point.  There is
1777                  * nothing we can do except to try to keep going.
1778                  */
1779                 (void) xfs_bmap_finish(&tp,  &free_list, first_block,
1780                                        &committed);
1781                 (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
1782         }
1783         /*
1784          * Release the dquots held by inode, if any.
1785          */
1786         XFS_QM_DQDETACH(mp, ip);
1787
1788         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1789
1790  out:
1791         return VN_INACTIVE_CACHE;
1792 }
1793
1794
1795 /*
1796  * xfs_lookup
1797  */
1798 STATIC int
1799 xfs_lookup(
1800         bhv_desc_t              *dir_bdp,
1801         vname_t                 *dentry,
1802         vnode_t                 **vpp,
1803         int                     flags,
1804         vnode_t                 *rdir,
1805         cred_t                  *credp)
1806 {
1807         xfs_inode_t             *dp, *ip;
1808         xfs_ino_t               e_inum;
1809         int                     error;
1810         uint                    lock_mode;
1811         vnode_t                 *dir_vp;
1812
1813         dir_vp = BHV_TO_VNODE(dir_bdp);
1814         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1815
1816         dp = XFS_BHVTOI(dir_bdp);
1817
1818         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1819                 return XFS_ERROR(EIO);
1820
1821         lock_mode = xfs_ilock_map_shared(dp);
1822         error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1823         if (!error) {
1824                 *vpp = XFS_ITOV(ip);
1825                 ITRACE(ip);
1826         }
1827         xfs_iunlock_map_shared(dp, lock_mode);
1828         return error;
1829 }
1830
1831
1832 /*
1833  * xfs_create (create a new file).
1834  */
1835 STATIC int
1836 xfs_create(
1837         bhv_desc_t              *dir_bdp,
1838         vname_t                 *dentry,
1839         vattr_t                 *vap,
1840         vnode_t                 **vpp,
1841         cred_t                  *credp)
1842 {
1843         char                    *name = VNAME(dentry);
1844         vnode_t                 *dir_vp;
1845         xfs_inode_t             *dp, *ip;
1846         vnode_t                 *vp=NULL;
1847         xfs_trans_t             *tp;
1848         xfs_mount_t             *mp;
1849         xfs_dev_t               rdev;
1850         int                     error;
1851         xfs_bmap_free_t         free_list;
1852         xfs_fsblock_t           first_block;
1853         boolean_t               dp_joined_to_trans;
1854         int                     dm_event_sent = 0;
1855         uint                    cancel_flags;
1856         int                     committed;
1857         xfs_prid_t              prid;
1858         struct xfs_dquot        *udqp, *gdqp;
1859         uint                    resblks;
1860         int                     dm_di_mode;
1861         int                     namelen;
1862
1863         ASSERT(!*vpp);
1864         dir_vp = BHV_TO_VNODE(dir_bdp);
1865         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1866
1867         dp = XFS_BHVTOI(dir_bdp);
1868         mp = dp->i_mount;
1869
1870         dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
1871         namelen = VNAMELEN(dentry);
1872
1873         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
1874                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1875                                 dir_vp, DM_RIGHT_NULL, NULL,
1876                                 DM_RIGHT_NULL, name, NULL,
1877                                 dm_di_mode, 0, 0);
1878
1879                 if (error)
1880                         return error;
1881                 dm_event_sent = 1;
1882         }
1883
1884         if (XFS_FORCED_SHUTDOWN(mp))
1885                 return XFS_ERROR(EIO);
1886
1887         /* Return through std_return after this point. */
1888
1889         udqp = gdqp = NULL;
1890         if (vap->va_mask & XFS_AT_PROJID)
1891                 prid = (xfs_prid_t)vap->va_projid;
1892         else
1893                 prid = (xfs_prid_t)dfltprid;
1894
1895         /*
1896          * Make sure that we have allocated dquot(s) on disk.
1897          */
1898         error = XFS_QM_DQVOPALLOC(mp, dp,
1899                         current_fsuid(credp), current_fsgid(credp),
1900                         XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1901         if (error)
1902                 goto std_return;
1903
1904         ip = NULL;
1905         dp_joined_to_trans = B_FALSE;
1906
1907         tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1908         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1909         resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1910         /*
1911          * Initially assume that the file does not exist and
1912          * reserve the resources for that case.  If that is not
1913          * the case we'll drop the one we have and get a more
1914          * appropriate transaction later.
1915          */
1916         error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1917                         XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1918         if (error == ENOSPC) {
1919                 resblks = 0;
1920                 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1921                                 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1922         }
1923         if (error) {
1924                 cancel_flags = 0;
1925                 dp = NULL;
1926                 goto error_return;
1927         }
1928
1929         xfs_ilock(dp, XFS_ILOCK_EXCL);
1930
1931         XFS_BMAP_INIT(&free_list, &first_block);
1932
1933         ASSERT(ip == NULL);
1934
1935         /*
1936          * Reserve disk quota and the inode.
1937          */
1938         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1939         if (error)
1940                 goto error_return;
1941
1942         if (resblks == 0 &&
1943             (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
1944                 goto error_return;
1945         rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
1946         error = xfs_dir_ialloc(&tp, dp,
1947                         MAKEIMODE(vap->va_type,vap->va_mode), 1,
1948                         rdev, credp, prid, resblks > 0,
1949                         &ip, &committed);
1950         if (error) {
1951                 if (error == ENOSPC)
1952                         goto error_return;
1953                 goto abort_return;
1954         }
1955         ITRACE(ip);
1956
1957         /*
1958          * At this point, we've gotten a newly allocated inode.
1959          * It is locked (and joined to the transaction).
1960          */
1961
1962         ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
1963
1964         /*
1965          * Now we join the directory inode to the transaction.
1966          * We do not do it earlier because xfs_dir_ialloc
1967          * might commit the previous transaction (and release
1968          * all the locks).
1969          */
1970
1971         VN_HOLD(dir_vp);
1972         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1973         dp_joined_to_trans = B_TRUE;
1974
1975         error = XFS_DIR_CREATENAME(mp, tp, dp, name, namelen, ip->i_ino,
1976                 &first_block, &free_list,
1977                 resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1978         if (error) {
1979                 ASSERT(error != ENOSPC);
1980                 goto abort_return;
1981         }
1982         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1983         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1984
1985         /*
1986          * If this is a synchronous mount, make sure that the
1987          * create transaction goes to disk before returning to
1988          * the user.
1989          */
1990         if (mp->m_flags & XFS_MOUNT_WSYNC) {
1991                 xfs_trans_set_sync(tp);
1992         }
1993
1994         dp->i_gen++;
1995
1996         /*
1997          * Attach the dquot(s) to the inodes and modify them incore.
1998          * These ids of the inode couldn't have changed since the new
1999          * inode has been locked ever since it was created.
2000          */
2001         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2002
2003         /*
2004          * xfs_trans_commit normally decrements the vnode ref count
2005          * when it unlocks the inode. Since we want to return the
2006          * vnode to the caller, we bump the vnode ref count now.
2007          */
2008         IHOLD(ip);
2009         vp = XFS_ITOV(ip);
2010
2011         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2012         if (error) {
2013                 xfs_bmap_cancel(&free_list);
2014                 goto abort_rele;
2015         }
2016
2017         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2018         if (error) {
2019                 IRELE(ip);
2020                 tp = NULL;
2021                 goto error_return;
2022         }
2023
2024         XFS_QM_DQRELE(mp, udqp);
2025         XFS_QM_DQRELE(mp, gdqp);
2026
2027         /*
2028          * Propogate the fact that the vnode changed after the
2029          * xfs_inode locks have been released.
2030          */
2031         VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3);
2032
2033         *vpp = vp;
2034
2035         /* Fallthrough to std_return with error = 0  */
2036
2037 std_return:
2038         if ( (*vpp || (error != 0 && dm_event_sent != 0)) &&
2039                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2040                                                         DM_EVENT_POSTCREATE)) {
2041                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2042                         dir_vp, DM_RIGHT_NULL,
2043                         *vpp ? vp:NULL,
2044                         DM_RIGHT_NULL, name, NULL,
2045                         dm_di_mode, error, 0);
2046         }
2047         return error;
2048
2049  abort_return:
2050         cancel_flags |= XFS_TRANS_ABORT;
2051         /* FALLTHROUGH */
2052  error_return:
2053
2054         if (tp != NULL)
2055                 xfs_trans_cancel(tp, cancel_flags);
2056
2057         if (!dp_joined_to_trans && (dp != NULL))
2058                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2059         XFS_QM_DQRELE(mp, udqp);
2060         XFS_QM_DQRELE(mp, gdqp);
2061
2062         goto std_return;
2063
2064  abort_rele:
2065         /*
2066          * Wait until after the current transaction is aborted to
2067          * release the inode.  This prevents recursive transactions
2068          * and deadlocks from xfs_inactive.
2069          */
2070         cancel_flags |= XFS_TRANS_ABORT;
2071         xfs_trans_cancel(tp, cancel_flags);
2072         IRELE(ip);
2073
2074         XFS_QM_DQRELE(mp, udqp);
2075         XFS_QM_DQRELE(mp, gdqp);
2076
2077         goto std_return;
2078 }
2079
#ifdef DEBUG
/*
 * Debug-only statistics showing whether, and how often, the deadlock
 * avoidance paths in xfs_lock_dir_and_entry() are being taken.
 */

/* Number of calls to xfs_lock_dir_and_entry(). */
int xfs_rm_locks;

/* Number of times the non-blocking lock of the entry inode failed. */
int xfs_rm_attempts;

/* Number of times we slept for a tick before retrying. */
int xfs_rm_lock_delays;
#endif
2090
2091 /*
2092  * The following routine will lock the inodes associated with the
2093  * directory and the named entry in the directory. The locks are
2094  * acquired in increasing inode number.
2095  *
2096  * If the entry is "..", then only the directory is locked. The
2097  * vnode ref count will still include that from the .. entry in
2098  * this case.
2099  *
2100  * There is a deadlock we need to worry about. If the locked directory is
2101  * in the AIL, it might be blocking up the log. The next inode we lock
2102  * could be already locked by another thread waiting for log space (e.g
2103  * a permanent log reservation with a long running transaction (see
2104  * xfs_itruncate_finish)). To solve this, we must check if the directory
2105  * is in the ail and use lock_nowait. If we can't lock, we need to
2106  * drop the inode lock on the directory and try again. xfs_iunlock will
2107  * potentially push the tail if we were holding up the log.
2108  */
2109 STATIC int
2110 xfs_lock_dir_and_entry(
2111         xfs_inode_t     *dp,
2112         vname_t         *dentry,
2113         xfs_inode_t     *ip)    /* inode of entry 'name' */
2114 {
2115         int             attempts;
2116         xfs_ino_t       e_inum;
2117         xfs_inode_t     *ips[2];
2118         xfs_log_item_t  *lp;
2119
2120 #ifdef DEBUG
2121         xfs_rm_locks++;
2122 #endif
2123         attempts = 0;
2124
2125 again:
2126         xfs_ilock(dp, XFS_ILOCK_EXCL);
2127
2128         e_inum = ip->i_ino;
2129
2130         ITRACE(ip);
2131
2132         /*
2133          * We want to lock in increasing inum. Since we've already
2134          * acquired the lock on the directory, we may need to release
2135          * if if the inum of the entry turns out to be less.
2136          */
2137         if (e_inum > dp->i_ino) {
2138                 /*
2139                  * We are already in the right order, so just
2140                  * lock on the inode of the entry.
2141                  * We need to use nowait if dp is in the AIL.
2142                  */
2143
2144                 lp = (xfs_log_item_t *)dp->i_itemp;
2145                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2146                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2147                                 attempts++;
2148 #ifdef DEBUG
2149                                 xfs_rm_attempts++;
2150 #endif
2151
2152                                 /*
2153                                  * Unlock dp and try again.
2154                                  * xfs_iunlock will try to push the tail
2155                                  * if the inode is in the AIL.
2156                                  */
2157
2158                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2159
2160                                 if ((attempts % 5) == 0) {
2161                                         delay(1); /* Don't just spin the CPU */
2162 #ifdef DEBUG
2163                                         xfs_rm_lock_delays++;
2164 #endif
2165                                 }
2166                                 goto again;
2167                         }
2168                 } else {
2169                         xfs_ilock(ip, XFS_ILOCK_EXCL);
2170                 }
2171         } else if (e_inum < dp->i_ino) {
2172                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2173
2174                 ips[0] = ip;
2175                 ips[1] = dp;
2176                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2177         }
2178         /* else  e_inum == dp->i_ino */
2179         /*     This can happen if we're asked to lock /x/..
2180          *     the entry is "..", which is also the parent directory.
2181          */
2182
2183         return 0;
2184 }
2185
#ifdef DEBUG
/* Debug-only statistics maintained by xfs_lock_inodes(). */

/* Calls that took every lock without a single retry. */
int xfs_locked_n;

/* Calls that needed fewer than 5 retries. */
int xfs_small_retries;

/* Calls that needed at least 5 but fewer than 100 retries. */
int xfs_middle_retries;

/* Calls that needed 100 retries or more. */
int xfs_lots_retries;

/* Number of times we slept for a tick before retrying. */
int xfs_lock_delays;
#endif
2193
2194 /*
2195  * The following routine will lock n inodes in exclusive mode.
2196  * We assume the caller calls us with the inodes in i_ino order.
2197  *
2198  * We need to detect deadlock where an inode that we lock
2199  * is in the AIL and we start waiting for another inode that is locked
2200  * by a thread in a long running transaction (such as truncate). This can
2201  * result in deadlock since the long running trans might need to wait
2202  * for the inode we just locked in order to push the tail and free space
2203  * in the log.
2204  */
2205 void
2206 xfs_lock_inodes(
2207         xfs_inode_t     **ips,
2208         int             inodes,
2209         int             first_locked,
2210         uint            lock_mode)
2211 {
2212         int             attempts = 0, i, j, try_lock;
2213         xfs_log_item_t  *lp;
2214
2215         ASSERT(ips && (inodes >= 2)); /* we need at least two */
2216
2217         if (first_locked) {
2218                 try_lock = 1;
2219                 i = 1;
2220         } else {
2221                 try_lock = 0;
2222                 i = 0;
2223         }
2224
2225 again:
2226         for (; i < inodes; i++) {
2227                 ASSERT(ips[i]);
2228
2229                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
2230                         continue;
2231
2232                 /*
2233                  * If try_lock is not set yet, make sure all locked inodes
2234                  * are not in the AIL.
2235                  * If any are, set try_lock to be used later.
2236                  */
2237
2238                 if (!try_lock) {
2239                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
2240                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2241                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2242                                         try_lock++;
2243                                 }
2244                         }
2245                 }
2246
2247                 /*
2248                  * If any of the previous locks we have locked is in the AIL,
2249                  * we must TRY to get the second and subsequent locks. If
2250                  * we can't get any, we must release all we have
2251                  * and try again.
2252                  */
2253
2254                 if (try_lock) {
2255                         /* try_lock must be 0 if i is 0. */
2256                         /*
2257                          * try_lock means we have an inode locked
2258                          * that is in the AIL.
2259                          */
2260                         ASSERT(i != 0);
2261                         if (!xfs_ilock_nowait(ips[i], lock_mode)) {
2262                                 attempts++;
2263
2264                                 /*
2265                                  * Unlock all previous guys and try again.
2266                                  * xfs_iunlock will try to push the tail
2267                                  * if the inode is in the AIL.
2268                                  */
2269
2270                                 for(j = i - 1; j >= 0; j--) {
2271
2272                                         /*
2273                                          * Check to see if we've already
2274                                          * unlocked this one.
2275                                          * Not the first one going back,
2276                                          * and the inode ptr is the same.
2277                                          */
2278                                         if ((j != (i - 1)) && ips[j] ==
2279                                                                 ips[j+1])
2280                                                 continue;
2281
2282                                         xfs_iunlock(ips[j], lock_mode);
2283                                 }
2284
2285                                 if ((attempts % 5) == 0) {
2286                                         delay(1); /* Don't just spin the CPU */
2287 #ifdef DEBUG
2288                                         xfs_lock_delays++;
2289 #endif
2290                                 }
2291                                 i = 0;
2292                                 try_lock = 0;
2293                                 goto again;
2294                         }
2295                 } else {
2296                         xfs_ilock(ips[i], lock_mode);
2297                 }
2298         }
2299
2300 #ifdef DEBUG
2301         if (attempts) {
2302                 if (attempts < 5) xfs_small_retries++;
2303                 else if (attempts < 100) xfs_middle_retries++;
2304                 else xfs_lots_retries++;
2305         } else {
2306                 xfs_locked_n++;
2307         }
2308 #endif
2309 }
2310
/*
 * REMOVE_DEBUG_TRACE(x) stores x in remove_which_error_return on DEBUG
 * builds and does nothing otherwise.  Both variants expand to a single
 * do { } while (0) statement so that "REMOVE_DEBUG_TRACE(n);" behaves
 * like one statement everywhere, including as the unbraced body of an
 * if that is followed by an else.
 */
#ifdef	DEBUG
#define REMOVE_DEBUG_TRACE(x)	do { remove_which_error_return = (x); } while (0)
int remove_which_error_return = 0;
#else /* ! DEBUG */
#define REMOVE_DEBUG_TRACE(x)	do { } while (0)
#endif	/* ! DEBUG */
2317
2318
2319 /*
2320  * xfs_remove
2321  *
2322  */
2323 STATIC int
2324 xfs_remove(
2325         bhv_desc_t              *dir_bdp,
2326         vname_t                 *dentry,
2327         cred_t                  *credp)
2328 {
2329         vnode_t                 *dir_vp;
2330         char                    *name = VNAME(dentry);
2331         xfs_inode_t             *dp, *ip;
2332         xfs_trans_t             *tp = NULL;
2333         xfs_mount_t             *mp;
2334         int                     error = 0;
2335         xfs_bmap_free_t         free_list;
2336         xfs_fsblock_t           first_block;
2337         int                     cancel_flags;
2338         int                     committed;
2339         int                     dm_di_mode = 0;
2340         int                     link_zero;
2341         uint                    resblks;
2342         int                     namelen;
2343
2344         dir_vp = BHV_TO_VNODE(dir_bdp);
2345         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2346
2347         dp = XFS_BHVTOI(dir_bdp);
2348         mp = dp->i_mount;
2349
2350         if (XFS_FORCED_SHUTDOWN(mp))
2351                 return XFS_ERROR(EIO);
2352
2353         namelen = VNAMELEN(dentry);
2354
2355         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2356                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2357                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2358                                         name, NULL, 0, 0, 0);
2359                 if (error)
2360                         return error;
2361         }
2362
2363         /* From this point on, return through std_return */
2364         ip = NULL;
2365
2366         /*
2367          * We need to get a reference to ip before we get our log
2368          * reservation. The reason for this is that we cannot call
2369          * xfs_iget for an inode for which we do not have a reference
2370          * once we've acquired a log reservation. This is because the
2371          * inode we are trying to get might be in xfs_inactive going
2372          * for a log reservation. Since we'll have to wait for the
2373          * inactive code to complete before returning from xfs_iget,
2374          * we need to make sure that we don't have log space reserved
2375          * when we call xfs_iget.  Instead we get an unlocked referece
2376          * to the inode before getting our log reservation.
2377          */
2378         error = xfs_get_dir_entry(dentry, &ip);
2379         if (error) {
2380                 REMOVE_DEBUG_TRACE(__LINE__);
2381                 goto std_return;
2382         }
2383
2384         dm_di_mode = ip->i_d.di_mode;
2385
2386         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2387
2388         ITRACE(ip);
2389
2390         error = XFS_QM_DQATTACH(mp, dp, 0);
2391         if (!error && dp != ip)
2392                 error = XFS_QM_DQATTACH(mp, ip, 0);
2393         if (error) {
2394                 REMOVE_DEBUG_TRACE(__LINE__);
2395                 IRELE(ip);
2396                 goto std_return;
2397         }
2398
2399         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2400         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2401         /*
2402          * We try to get the real space reservation first,
2403          * allowing for directory btree deletion(s) implying
2404          * possible bmap insert(s).  If we can't get the space
2405          * reservation then we use 0 instead, and avoid the bmap
2406          * btree insert(s) in the directory code by, if the bmap
2407          * insert tries to happen, instead trimming the LAST
2408          * block from the directory.
2409          */
2410         resblks = XFS_REMOVE_SPACE_RES(mp);
2411         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2412                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2413         if (error == ENOSPC) {
2414                 resblks = 0;
2415                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2416                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2417         }
2418         if (error) {
2419                 ASSERT(error != ENOSPC);
2420                 REMOVE_DEBUG_TRACE(__LINE__);
2421                 xfs_trans_cancel(tp, 0);
2422                 IRELE(ip);
2423                 return error;
2424         }
2425
2426         error = xfs_lock_dir_and_entry(dp, dentry, ip);
2427         if (error) {
2428                 REMOVE_DEBUG_TRACE(__LINE__);
2429                 xfs_trans_cancel(tp, cancel_flags);
2430                 IRELE(ip);
2431                 goto std_return;
2432         }
2433
2434         /*
2435          * At this point, we've gotten both the directory and the entry
2436          * inodes locked.
2437          */
2438         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2439         if (dp != ip) {
2440                 /*
2441                  * Increment vnode ref count only in this case since
2442                  * there's an extra vnode reference in the case where
2443                  * dp == ip.
2444                  */
2445                 IHOLD(dp);
2446                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2447         }
2448
2449         /*
2450          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2451          */
2452         XFS_BMAP_INIT(&free_list, &first_block);
2453         error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, ip->i_ino,
2454                 &first_block, &free_list, 0);
2455         if (error) {
2456                 ASSERT(error != ENOENT);
2457                 REMOVE_DEBUG_TRACE(__LINE__);
2458                 goto error1;
2459         }
2460         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2461
2462         dp->i_gen++;
2463         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2464
2465         error = xfs_droplink(tp, ip);
2466         if (error) {
2467                 REMOVE_DEBUG_TRACE(__LINE__);
2468                 goto error1;
2469         }
2470
2471         /* Determine if this is the last link while
2472          * we are in the transaction.
2473          */
2474         link_zero = (ip)->i_d.di_nlink==0;
2475
2476         /*
2477          * Take an extra ref on the inode so that it doesn't
2478          * go to xfs_inactive() from within the commit.
2479          */
2480         IHOLD(ip);
2481
2482         /*
2483          * If this is a synchronous mount, make sure that the
2484          * remove transaction goes to disk before returning to
2485          * the user.
2486          */
2487         if (mp->m_flags & XFS_MOUNT_WSYNC) {
2488                 xfs_trans_set_sync(tp);
2489         }
2490
2491         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2492         if (error) {
2493                 REMOVE_DEBUG_TRACE(__LINE__);
2494                 goto error_rele;
2495         }
2496
2497         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2498         if (error) {
2499                 IRELE(ip);
2500                 goto std_return;
2501         }
2502
2503         /*
2504          * Before we drop our extra reference to the inode, purge it
2505          * from the refcache if it is there.  By waiting until afterwards
2506          * to do the IRELE, we ensure that we won't go inactive in the
2507          * xfs_refcache_purge_ip routine (although that would be OK).
2508          */
2509         xfs_refcache_purge_ip(ip);
2510
2511         vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2512
2513         /*
2514          * Let interposed file systems know about removed links.
2515          */
2516         VOP_LINK_REMOVED(XFS_ITOV(ip), dir_vp, link_zero);
2517
2518         IRELE(ip);
2519
2520 /*      Fall through to std_return with error = 0 */
2521  std_return:
2522         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2523                                                 DM_EVENT_POSTREMOVE)) {
2524                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2525                                 dir_vp, DM_RIGHT_NULL,
2526                                 NULL, DM_RIGHT_NULL,
2527                                 name, NULL, dm_di_mode, error, 0);
2528         }
2529         return error;
2530
2531  error1:
2532         xfs_bmap_cancel(&free_list);
2533         cancel_flags |= XFS_TRANS_ABORT;
2534         xfs_trans_cancel(tp, cancel_flags);
2535         goto std_return;
2536
2537  error_rele:
2538         /*
2539          * In this case make sure to not release the inode until after
2540          * the current transaction is aborted.  Releasing it beforehand
2541          * can cause us to go to xfs_inactive and start a recursive
2542          * transaction which can easily deadlock with the current one.
2543          */
2544         xfs_bmap_cancel(&free_list);
2545         cancel_flags |= XFS_TRANS_ABORT;
2546         xfs_trans_cancel(tp, cancel_flags);
2547
2548         /*
2549          * Before we drop our extra reference to the inode, purge it
2550          * from the refcache if it is there.  By waiting until afterwards
2551          * to do the IRELE, we ensure that we won't go inactive in the
2552          * xfs_refcache_purge_ip routine (although that would be OK).
2553          */
2554         xfs_refcache_purge_ip(ip);
2555
2556         IRELE(ip);
2557
2558         goto std_return;
2559 }
2560
2561
2562 /*
2563  * xfs_link
2564  *
2565  */
2566 STATIC int
2567 xfs_link(
2568         bhv_desc_t              *target_dir_bdp,
2569         vnode_t                 *src_vp,
2570         vname_t                 *dentry,
2571         cred_t                  *credp)
2572 {
2573         xfs_inode_t             *tdp, *sip;
2574         xfs_trans_t             *tp;
2575         xfs_mount_t             *mp;
2576         xfs_inode_t             *ips[2];
2577         int                     error;
2578         xfs_bmap_free_t         free_list;
2579         xfs_fsblock_t           first_block;
2580         int                     cancel_flags;
2581         int                     committed;
2582         vnode_t                 *target_dir_vp;
2583         bhv_desc_t              *src_bdp;
2584         int                     resblks;
2585         char                    *target_name = VNAME(dentry);
2586         int                     target_namelen;
2587
2588         target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2589         vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2590         vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2591
2592         target_namelen = VNAMELEN(dentry);
2593         if (src_vp->v_type == VDIR)
2594                 return XFS_ERROR(EPERM);
2595
2596         /*
2597          * For now, manually find the XFS behavior descriptor for
2598          * the source vnode.  If it doesn't exist then something
2599          * is wrong and we should just return an error.
2600          * Eventually we need to figure out how link is going to
2601          * work in the face of stacked vnodes.
2602          */
2603         src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops);
2604         if (src_bdp == NULL) {
2605                 return XFS_ERROR(EXDEV);
2606         }
2607         sip = XFS_BHVTOI(src_bdp);
2608         tdp = XFS_BHVTOI(target_dir_bdp);
2609         mp = tdp->i_mount;
2610         if (XFS_FORCED_SHUTDOWN(mp))
2611                 return XFS_ERROR(EIO);
2612
2613         if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
2614                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2615                                         target_dir_vp, DM_RIGHT_NULL,
2616                                         src_vp, DM_RIGHT_NULL,
2617                                         target_name, NULL, 0, 0, 0);
2618                 if (error)
2619                         return error;
2620         }
2621
2622         /* Return through std_return after this point. */
2623
2624         error = XFS_QM_DQATTACH(mp, sip, 0);
2625         if (!error && sip != tdp)
2626                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2627         if (error)
2628                 goto std_return;
2629
2630         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2631         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2632         resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2633         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2634                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2635         if (error == ENOSPC) {
2636                 resblks = 0;
2637                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2638                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2639         }
2640         if (error) {
2641                 cancel_flags = 0;
2642                 goto error_return;
2643         }
2644
2645         if (sip->i_ino < tdp->i_ino) {
2646                 ips[0] = sip;
2647                 ips[1] = tdp;
2648         } else {
2649                 ips[0] = tdp;
2650                 ips[1] = sip;
2651         }
2652
2653         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2654
2655         /*
2656          * Increment vnode ref counts since xfs_trans_commit &
2657          * xfs_trans_cancel will both unlock the inodes and
2658          * decrement the associated ref counts.
2659          */
2660         VN_HOLD(src_vp);
2661         VN_HOLD(target_dir_vp);
2662         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2663         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2664
2665         /*
2666          * If the source has too many links, we can't make any more to it.
2667          */
2668         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2669                 error = XFS_ERROR(EMLINK);
2670                 goto error_return;
2671         }
2672
2673         if (resblks == 0 &&
2674             (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name,
2675                         target_namelen)))
2676                 goto error_return;
2677
2678         XFS_BMAP_INIT(&free_list, &first_block);
2679
2680         error = XFS_DIR_CREATENAME(mp, tp, tdp, target_name, target_namelen,
2681                                    sip->i_ino, &first_block, &free_list,
2682                                    resblks);
2683         if (error)
2684                 goto abort_return;
2685         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2686         tdp->i_gen++;
2687         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2688
2689         error = xfs_bumplink(tp, sip);
2690         if (error) {
2691                 goto abort_return;
2692         }
2693
2694         /*
2695          * If this is a synchronous mount, make sure that the
2696          * link transaction goes to disk before returning to
2697          * the user.
2698          */
2699         if (mp->m_flags & XFS_MOUNT_WSYNC) {
2700                 xfs_trans_set_sync(tp);
2701         }
2702
2703         error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
2704         if (error) {
2705                 xfs_bmap_cancel(&free_list);
2706                 goto abort_return;
2707         }
2708
2709         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2710         if (error) {
2711                 goto std_return;
2712         }
2713
2714         /* Fall through to std_return with error = 0. */
2715 std_return:
2716         if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
2717                                                 DM_EVENT_POSTLINK)) {
2718                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2719                                 target_dir_vp, DM_RIGHT_NULL,
2720                                 src_vp, DM_RIGHT_NULL,
2721                                 target_name, NULL, 0, error, 0);
2722         }
2723         return error;
2724
2725  abort_return:
2726         cancel_flags |= XFS_TRANS_ABORT;
2727         /* FALLTHROUGH */
2728  error_return:
2729         xfs_trans_cancel(tp, cancel_flags);
2730
2731         goto std_return;
2732 }
2733 /*
2734  * xfs_mkdir
2735  *
2736  */
2737 STATIC int
2738 xfs_mkdir(
2739         bhv_desc_t              *dir_bdp,
2740         vname_t                 *dentry,
2741         vattr_t                 *vap,
2742         vnode_t                 **vpp,
2743         cred_t                  *credp)
2744 {
2745         char                    *dir_name = VNAME(dentry);
2746         xfs_inode_t             *dp;
2747         xfs_inode_t             *cdp;   /* inode of created dir */
2748         vnode_t                 *cvp;   /* vnode of created dir */
2749         xfs_trans_t             *tp;
2750         xfs_mount_t             *mp;
2751         int                     cancel_flags;
2752         int                     error;
2753         int                     committed;
2754         xfs_bmap_free_t         free_list;
2755         xfs_fsblock_t           first_block;
2756         vnode_t                 *dir_vp;
2757         boolean_t               dp_joined_to_trans;
2758         boolean_t               created = B_FALSE;
2759         int                     dm_event_sent = 0;
2760         xfs_prid_t              prid;
2761         struct xfs_dquot        *udqp, *gdqp;
2762         uint                    resblks;
2763         int                     dm_di_mode;
2764         int                     dir_namelen;
2765
2766         dir_vp = BHV_TO_VNODE(dir_bdp);
2767         dp = XFS_BHVTOI(dir_bdp);
2768         mp = dp->i_mount;
2769
2770         if (XFS_FORCED_SHUTDOWN(mp))
2771                 return XFS_ERROR(EIO);
2772
2773         dir_namelen = VNAMELEN(dentry);
2774
2775         tp = NULL;
2776         dp_joined_to_trans = B_FALSE;
2777         dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
2778
2779         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
2780                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2781                                         dir_vp, DM_RIGHT_NULL, NULL,
2782                                         DM_RIGHT_NULL, dir_name, NULL,
2783                                         dm_di_mode, 0, 0);
2784                 if (error)
2785                         return error;
2786                 dm_event_sent = 1;
2787         }
2788
2789         /* Return through std_return after this point. */
2790
2791         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2792
2793         mp = dp->i_mount;
2794         udqp = gdqp = NULL;
2795         if (vap->va_mask & XFS_AT_PROJID)
2796                 prid = (xfs_prid_t)vap->va_projid;
2797         else
2798                 prid = (xfs_prid_t)dfltprid;
2799
2800         /*
2801          * Make sure that we have allocated dquot(s) on disk.
2802          */
2803         error = XFS_QM_DQVOPALLOC(mp, dp,
2804                         current_fsuid(credp), current_fsgid(credp),
2805                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2806         if (error)
2807                 goto std_return;
2808
2809         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2810         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2811         resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2812         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2813                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2814         if (error == ENOSPC) {
2815                 resblks = 0;
2816                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2817                                           XFS_TRANS_PERM_LOG_RES,
2818                                           XFS_MKDIR_LOG_COUNT);
2819         }
2820         if (error) {
2821                 cancel_flags = 0;
2822                 dp = NULL;
2823                 goto error_return;
2824         }
2825
2826         xfs_ilock(dp, XFS_ILOCK_EXCL);
2827
2828         /*
2829          * Check for directory link count overflow.
2830          */
2831         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2832                 error = XFS_ERROR(EMLINK);
2833                 goto error_return;
2834         }
2835
2836         /*
2837          * Reserve disk quota and the inode.
2838          */
2839         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2840         if (error)
2841                 goto error_return;
2842
2843         if (resblks == 0 &&
2844             (error = XFS_DIR_CANENTER(mp, tp, dp, dir_name, dir_namelen)))
2845                 goto error_return;
2846         /*
2847          * create the directory inode.
2848          */
2849         error = xfs_dir_ialloc(&tp, dp,
2850                         MAKEIMODE(vap->va_type,vap->va_mode), 2,
2851                         0, credp, prid, resblks > 0,
2852                 &cdp, NULL);
2853         if (error) {
2854                 if (error == ENOSPC)
2855                         goto error_return;
2856                 goto abort_return;
2857         }
2858         ITRACE(cdp);
2859
2860         /*
2861          * Now we add the directory inode to the transaction.
2862          * We waited until now since xfs_dir_ialloc might start
2863          * a new transaction.  Had we joined the transaction
2864          * earlier, the locks might have gotten released.
2865          */
2866         VN_HOLD(dir_vp);
2867         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2868         dp_joined_to_trans = B_TRUE;
2869
2870         XFS_BMAP_INIT(&free_list, &first_block);
2871
2872         error = XFS_DIR_CREATENAME(mp, tp, dp, dir_name, dir_namelen,
2873                         cdp->i_ino, &first_block, &free_list,
2874                         resblks ? resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2875         if (error) {
2876                 ASSERT(error != ENOSPC);
2877                 goto error1;
2878         }
2879         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2880
2881         /*
2882          * Bump the in memory version number of the parent directory
2883          * so that other processes accessing it will recognize that
2884          * the directory has changed.
2885          */
2886         dp->i_gen++;
2887
2888         error = XFS_DIR_INIT(mp, tp, cdp, dp);
2889         if (error) {
2890                 goto error2;
2891         }
2892
2893         cdp->i_gen = 1;
2894         error = xfs_bumplink(tp, dp);
2895         if (error) {
2896                 goto error2;
2897         }
2898
2899         cvp = XFS_ITOV(cdp);
2900
2901         created = B_TRUE;
2902
2903         *vpp = cvp;
2904         IHOLD(cdp);
2905
2906         /*
2907          * Attach the dquots to the new inode and modify the icount incore.
2908          */
2909         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2910
2911         /*
2912          * If this is a synchronous mount, make sure that the
2913          * mkdir transaction goes to disk before returning to
2914          * the user.
2915          */
2916         if (mp->m_flags & XFS_MOUNT_WSYNC) {
2917                 xfs_trans_set_sync(tp);
2918         }
2919
2920         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
2921         if (error) {
2922                 IRELE(cdp);
2923                 goto error2;
2924         }
2925
2926         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
2927         XFS_QM_DQRELE(mp, udqp);
2928         XFS_QM_DQRELE(mp, gdqp);
2929         if (error) {
2930                 IRELE(cdp);
2931         }
2932
2933         /* Fall through to std_return with error = 0 or errno from
2934          * xfs_trans_commit. */
2935
2936 std_return:
2937         if ( (created || (error != 0 && dm_event_sent != 0)) &&
2938                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2939                                                 DM_EVENT_POSTCREATE)) {
2940                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2941                                         dir_vp, DM_RIGHT_NULL,
2942                                         created ? XFS_ITOV(cdp):NULL,
2943                                         DM_RIGHT_NULL,
2944                                         dir_name, NULL,
2945                                         dm_di_mode, error, 0);
2946         }
2947         return error;
2948
2949  error2:
2950  error1:
2951         xfs_bmap_cancel(&free_list);
2952  abort_return:
2953         cancel_flags |= XFS_TRANS_ABORT;
2954  error_return:
2955         xfs_trans_cancel(tp, cancel_flags);
2956         XFS_QM_DQRELE(mp, udqp);
2957         XFS_QM_DQRELE(mp, gdqp);
2958
2959         if (!dp_joined_to_trans && (dp != NULL)) {
2960                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2961         }
2962
2963         goto std_return;
2964 }
2965
2966
2967 /*
2968  * xfs_rmdir
2969  *
2970  */
2971 STATIC int
2972 xfs_rmdir(
2973         bhv_desc_t              *dir_bdp,
2974         vname_t                 *dentry,
2975         cred_t                  *credp)
2976 {
2977         char                    *name = VNAME(dentry);
2978         xfs_inode_t             *dp;
2979         xfs_inode_t             *cdp;   /* child directory */
2980         xfs_trans_t             *tp;
2981         xfs_mount_t             *mp;
2982         int                     error;
2983         xfs_bmap_free_t         free_list;
2984         xfs_fsblock_t           first_block;
2985         int                     cancel_flags;
2986         int                     committed;
2987         vnode_t                 *dir_vp;
2988         int                     dm_di_mode = 0;
2989         int                     last_cdp_link;
2990         int                     namelen;
2991         uint                    resblks;
2992
2993         dir_vp = BHV_TO_VNODE(dir_bdp);
2994         dp = XFS_BHVTOI(dir_bdp);
2995         mp = dp->i_mount;
2996
2997         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2998
2999         if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3000                 return XFS_ERROR(EIO);
3001         namelen = VNAMELEN(dentry);
3002
3003         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3004                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3005                                         dir_vp, DM_RIGHT_NULL,
3006                                         NULL, DM_RIGHT_NULL,
3007                                         name, NULL, 0, 0, 0);
3008                 if (error)
3009                         return XFS_ERROR(error);
3010         }
3011
3012         /* Return through std_return after this point. */
3013
3014         cdp = NULL;
3015
3016         /*
3017          * We need to get a reference to cdp before we get our log
3018          * reservation.  The reason for this is that we cannot call
3019          * xfs_iget for an inode for which we do not have a reference
3020          * once we've acquired a log reservation.  This is because the
3021          * inode we are trying to get might be in xfs_inactive going
3022          * for a log reservation.  Since we'll have to wait for the
3023          * inactive code to complete before returning from xfs_iget,
3024          * we need to make sure that we don't have log space reserved
3025          * when we call xfs_iget.  Instead we get an unlocked referece
3026          * to the inode before getting our log reservation.
3027          */
3028         error = xfs_get_dir_entry(dentry, &cdp);
3029         if (error) {
3030                 REMOVE_DEBUG_TRACE(__LINE__);
3031                 goto std_return;
3032         }
3033         mp = dp->i_mount;
3034         dm_di_mode = cdp->i_d.di_mode;
3035
3036         /*
3037          * Get the dquots for the inodes.
3038          */
3039         error = XFS_QM_DQATTACH(mp, dp, 0);
3040         if (!error && dp != cdp)
3041                 error = XFS_QM_DQATTACH(mp, cdp, 0);
3042         if (error) {
3043                 IRELE(cdp);
3044                 REMOVE_DEBUG_TRACE(__LINE__);
3045                 goto std_return;
3046         }
3047
3048         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3049         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3050         /*
3051          * We try to get the real space reservation first,
3052          * allowing for directory btree deletion(s) implying
3053          * possible bmap insert(s).  If we can't get the space
3054          * reservation then we use 0 instead, and avoid the bmap
3055          * btree insert(s) in the directory code by, if the bmap
3056          * insert tries to happen, instead trimming the LAST
3057          * block from the directory.
3058          */
3059         resblks = XFS_REMOVE_SPACE_RES(mp);
3060         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3061                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3062         if (error == ENOSPC) {
3063                 resblks = 0;
3064                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3065                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3066         }
3067         if (error) {
3068                 ASSERT(error != ENOSPC);
3069                 cancel_flags = 0;
3070                 IRELE(cdp);
3071                 goto error_return;
3072         }
3073         XFS_BMAP_INIT(&free_list, &first_block);
3074
3075         /*
3076          * Now lock the child directory inode and the parent directory
3077          * inode in the proper order.  This will take care of validating
3078          * that the directory entry for the child directory inode has
3079          * not changed while we were obtaining a log reservation.
3080          */
3081         error = xfs_lock_dir_and_entry(dp, dentry, cdp);
3082         if (error) {
3083                 xfs_trans_cancel(tp, cancel_flags);
3084                 IRELE(cdp);
3085                 goto std_return;
3086         }
3087
3088         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3089         if (dp != cdp) {
3090                 /*
3091                  * Only increment the parent directory vnode count if
3092                  * we didn't bump it in looking up cdp.  The only time
3093                  * we don't bump it is when we're looking up ".".
3094                  */
3095                 VN_HOLD(dir_vp);
3096         }
3097
3098         ITRACE(cdp);
3099         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3100
3101         ASSERT(cdp->i_d.di_nlink >= 2);
3102         if (cdp->i_d.di_nlink != 2) {
3103                 error = XFS_ERROR(ENOTEMPTY);
3104                 goto error_return;
3105         }
3106         if (!XFS_DIR_ISEMPTY(mp, cdp)) {
3107                 error = XFS_ERROR(ENOTEMPTY);
3108                 goto error_return;
3109         }
3110
3111         error = XFS_DIR_REMOVENAME(mp, tp, dp, name, namelen, cdp->i_ino,
3112                 &first_block, &free_list, resblks);
3113         if (error) {
3114                 goto error1;
3115         }
3116
3117         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3118
3119         /*
3120          * Bump the in memory generation count on the parent
3121          * directory so that other can know that it has changed.
3122          */
3123         dp->i_gen++;
3124
3125         /*
3126          * Drop the link from cdp's "..".
3127          */
3128         error = xfs_droplink(tp, dp);
3129         if (error) {
3130                 goto error1;
3131         }
3132
3133         /*
3134          * Drop the link from dp to cdp.
3135          */
3136         error = xfs_droplink(tp, cdp);
3137         if (error) {
3138                 goto error1;
3139         }
3140
3141         /*
3142          * Drop the "." link from cdp to self.
3143          */
3144         error = xfs_droplink(tp, cdp);
3145         if (error) {
3146                 goto error1;
3147         }
3148
3149         /* Determine these before committing transaction */
3150         last_cdp_link = (cdp)->i_d.di_nlink==0;
3151
3152         /*
3153          * Take an extra ref on the child vnode so that it
3154          * does not go to xfs_inactive() from within the commit.
3155          */
3156         IHOLD(cdp);
3157
3158         /*
3159          * If this is a synchronous mount, make sure that the
3160          * rmdir transaction goes to disk before returning to
3161          * the user.
3162          */
3163         if (mp->m_flags & XFS_MOUNT_WSYNC) {
3164                 xfs_trans_set_sync(tp);
3165         }
3166
3167         error = xfs_bmap_finish (&tp, &free_list, first_block, &committed);
3168         if (error) {
3169                 xfs_bmap_cancel(&free_list);
3170                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3171                                  XFS_TRANS_ABORT));
3172                 IRELE(cdp);
3173                 goto std_return;
3174         }
3175
3176         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3177         if (error) {
3178                 IRELE(cdp);
3179                 goto std_return;
3180         }
3181
3182
3183         /*
3184          * Let interposed file systems know about removed links.
3185          */
3186         VOP_LINK_REMOVED(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3187
3188         IRELE(cdp);
3189
3190         /* Fall through to std_return with error = 0 or the errno
3191          * from xfs_trans_commit. */
3192 std_return:
3193         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
3194                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3195                                         dir_vp, DM_RIGHT_NULL,
3196                                         NULL, DM_RIGHT_NULL,
3197                                         name, NULL, dm_di_mode,
3198                                         error, 0);
3199         }
3200         return error;
3201
3202  error1:
3203         xfs_bmap_cancel(&free_list);
3204         cancel_flags |= XFS_TRANS_ABORT;
3205  error_return:
3206         xfs_trans_cancel(tp, cancel_flags);
3207         goto std_return;
3208 }
3209
3210
3211 /*
3212  * xfs_readdir
3213  *
3214  * Read dp's entries starting at uiop->uio_offset and translate them into
3215  * bufsize bytes worth of struct dirents starting at bufbase.
3216  */
3217 STATIC int
3218 xfs_readdir(
3219         bhv_desc_t      *dir_bdp,
3220         uio_t           *uiop,
3221         cred_t          *credp,
3222         int             *eofp)
3223 {
3224         xfs_inode_t     *dp;
3225         xfs_trans_t     *tp = NULL;
3226         int             error = 0;
3227         uint            lock_mode;
3228         xfs_off_t       start_offset;
3229
3230         vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3231                                                (inst_t *)__return_address);
3232         dp = XFS_BHVTOI(dir_bdp);
3233
3234         if (XFS_FORCED_SHUTDOWN(dp->i_mount)) {
3235                 return XFS_ERROR(EIO);
3236         }
3237
3238         lock_mode = xfs_ilock_map_shared(dp);
3239         start_offset = uiop->uio_offset;
3240         error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp);
3241         if (start_offset != uiop->uio_offset) {
3242                 xfs_ichgtime(dp, XFS_ICHGTIME_ACC);
3243         }
3244         xfs_iunlock_map_shared(dp, lock_mode);
3245         return error;
3246 }
3247
3248
3249 /*
3250  * xfs_symlink
3251  *
3252  */
3253 STATIC int
3254 xfs_symlink(
3255         bhv_desc_t              *dir_bdp,
3256         vname_t                 *dentry,
3257         vattr_t                 *vap,
3258         char                    *target_path,
3259         vnode_t                 **vpp,
3260         cred_t                  *credp)
3261 {
3262         xfs_trans_t             *tp;
3263         xfs_mount_t             *mp;
3264         xfs_inode_t             *dp;
3265         xfs_inode_t             *ip;
3266         int                     error;
3267         int                     pathlen;
3268         xfs_bmap_free_t         free_list;
3269         xfs_fsblock_t           first_block;
3270         boolean_t               dp_joined_to_trans;
3271         vnode_t                 *dir_vp;
3272         uint                    cancel_flags;
3273         int                     committed;
3274         xfs_fileoff_t           first_fsb;
3275         xfs_filblks_t           fs_blocks;
3276         int                     nmaps;
3277         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3278         xfs_daddr_t             d;
3279         char                    *cur_chunk;
3280         int                     byte_cnt;
3281         int                     n;
3282         xfs_buf_t               *bp;
3283         xfs_prid_t              prid;
3284         struct xfs_dquot        *udqp, *gdqp;
3285         uint                    resblks;
3286         char                    *link_name = VNAME(dentry);
3287         int                     link_namelen;
3288
3289         *vpp = NULL;
3290         dir_vp = BHV_TO_VNODE(dir_bdp);
3291         dp = XFS_BHVTOI(dir_bdp);
3292         dp_joined_to_trans = B_FALSE;
3293         error = 0;
3294         ip = NULL;
3295         tp = NULL;
3296
3297         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3298
3299         mp = dp->i_mount;
3300
3301         if (XFS_FORCED_SHUTDOWN(mp))
3302                 return XFS_ERROR(EIO);
3303
3304         link_namelen = VNAMELEN(dentry);
3305
3306         /*
3307          * Check component lengths of the target path name.
3308          */
3309         pathlen = strlen(target_path);
3310         if (pathlen >= MAXPATHLEN)      /* total string too long */
3311                 return XFS_ERROR(ENAMETOOLONG);
3312         if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3313                 int len, total;
3314                 char *path;
3315
3316                 for(total = 0, path = target_path; total < pathlen;) {
3317                         /*
3318                          * Skip any slashes.
3319                          */
3320                         while(*path == '/') {
3321                                 total++;
3322                                 path++;
3323                         }
3324
3325                         /*
3326                          * Count up to the next slash or end of path.
3327                          * Error out if the component is bigger than MAXNAMELEN.
3328                          */
3329                         for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3330                                 if (++len >= MAXNAMELEN) {
3331                                         error = ENAMETOOLONG;
3332                                         return error;
3333                                 }
3334                         }
3335                 }
3336         }
3337
3338         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
3339                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3340                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3341                                         link_name, target_path, 0, 0, 0);
3342                 if (error)
3343                         return error;
3344         }
3345
3346         /* Return through std_return after this point. */
3347
3348         udqp = gdqp = NULL;
3349         if (vap->va_mask & XFS_AT_PROJID)
3350                 prid = (xfs_prid_t)vap->va_projid;
3351         else
3352                 prid = (xfs_prid_t)dfltprid;
3353
3354         /*
3355          * Make sure that we have allocated dquot(s) on disk.
3356          */
3357         error = XFS_QM_DQVOPALLOC(mp, dp,
3358                         current_fsuid(credp), current_fsgid(credp),
3359                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3360         if (error)
3361                 goto std_return;
3362
3363         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3364         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3365         /*
3366          * The symlink will fit into the inode data fork?
3367          * There can't be any attributes so we get the whole variable part.
3368          */
3369         if (pathlen <= XFS_LITINO(mp))
3370                 fs_blocks = 0;
3371         else
3372                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3373         resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3374         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3375                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3376         if (error == ENOSPC && fs_blocks == 0) {
3377                 resblks = 0;
3378                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3379                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3380         }
3381         if (error) {
3382                 cancel_flags = 0;
3383                 dp = NULL;
3384                 goto error_return;
3385         }
3386
3387         xfs_ilock(dp, XFS_ILOCK_EXCL);
3388
3389         /*
3390          * Reserve disk quota : blocks and inode.
3391          */
3392         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3393         if (error)
3394                 goto error_return;
3395
3396         /*
3397          * Check for ability to enter directory entry, if no space reserved.
3398          */
3399         if (resblks == 0 &&
3400             (error = XFS_DIR_CANENTER(mp, tp, dp, link_name, link_namelen)))
3401                 goto error_return;
3402         /*
3403          * Initialize the bmap freelist prior to calling either
3404          * bmapi or the directory create code.
3405          */
3406         XFS_BMAP_INIT(&free_list, &first_block);
3407
3408         /*
3409          * Allocate an inode for the symlink.
3410          */
3411         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3412                                1, 0, credp, prid, resblks > 0, &ip, NULL);
3413         if (error) {
3414                 if (error == ENOSPC)
3415                         goto error_return;
3416                 goto error1;
3417         }
3418         ITRACE(ip);
3419
3420         VN_HOLD(dir_vp);
3421         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3422         dp_joined_to_trans = B_TRUE;
3423
3424         /*
3425          * Also attach the dquot(s) to it, if applicable.
3426          */
3427         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3428
3429         if (resblks)
3430                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3431         /*
3432          * If the symlink will fit into the inode, write it inline.
3433          */
3434         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3435                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3436                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3437                 ip->i_d.di_size = pathlen;
3438
3439                 /*
3440                  * The inode was initially created in extent format.
3441                  */
3442                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3443                 ip->i_df.if_flags |= XFS_IFINLINE;
3444
3445                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3446                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3447
3448         } else {
3449                 first_fsb = 0;
3450                 nmaps = SYMLINK_MAPS;
3451
3452                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3453                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3454                                   &first_block, resblks, mval, &nmaps,
3455                                   &free_list);
3456                 if (error) {
3457                         goto error1;
3458                 }
3459
3460                 if (resblks)
3461                         resblks -= fs_blocks;
3462                 ip->i_d.di_size = pathlen;
3463                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3464
3465                 cur_chunk = target_path;
3466                 for (n = 0; n < nmaps; n++) {
3467                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3468                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3469                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3470                                                BTOBB(byte_cnt), 0);
3471                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3472                         if (pathlen < byte_cnt) {
3473                                 byte_cnt = pathlen;
3474                         }
3475                         pathlen -= byte_cnt;
3476
3477                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3478                         cur_chunk += byte_cnt;
3479
3480                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3481                 }
3482         }
3483
3484         /*
3485          * Create the directory entry for the symlink.
3486          */
3487         error = XFS_DIR_CREATENAME(mp, tp, dp, link_name, link_namelen,
3488                         ip->i_ino, &first_block, &free_list, resblks);
3489         if (error) {
3490                 goto error1;
3491         }
3492         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3493         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3494
3495         /*
3496          * Bump the in memory version number of the parent directory
3497          * so that other processes accessing it will recognize that
3498          * the directory has changed.
3499          */
3500         dp->i_gen++;
3501
3502         /*
3503          * If this is a synchronous mount, make sure that the
3504          * symlink transaction goes to disk before returning to
3505          * the user.
3506          */
3507         if (mp->m_flags & XFS_MOUNT_WSYNC) {
3508                 xfs_trans_set_sync(tp);
3509         }
3510
3511         /*
3512          * xfs_trans_commit normally decrements the vnode ref count
3513          * when it unlocks the inode. Since we want to return the
3514          * vnode to the caller, we bump the vnode ref count now.
3515          */
3516         IHOLD(ip);
3517
3518         error = xfs_bmap_finish(&tp, &free_list, first_block, &committed);
3519         if (error) {
3520                 goto error2;
3521         }
3522         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
3523         XFS_QM_DQRELE(mp, udqp);
3524         XFS_QM_DQRELE(mp, gdqp);
3525
3526         /* Fall through to std_return with error = 0 or errno from
3527          * xfs_trans_commit     */
3528 std_return:
3529         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3530                              DM_EVENT_POSTSYMLINK)) {
3531                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3532                                         dir_vp, DM_RIGHT_NULL,
3533                                         error ? NULL : XFS_ITOV(ip),
3534                                         DM_RIGHT_NULL, link_name, target_path,
3535                                         0, error, 0);
3536         }
3537
3538         if (!error) {
3539                 vnode_t *vp;
3540
3541                 ASSERT(ip);
3542                 vp = XFS_ITOV(ip);
3543                 *vpp = vp;
3544         }
3545         return error;
3546
3547  error2:
3548         IRELE(ip);
3549  error1:
3550         xfs_bmap_cancel(&free_list);
3551         cancel_flags |= XFS_TRANS_ABORT;
3552  error_return:
3553         xfs_trans_cancel(tp, cancel_flags);
3554         XFS_QM_DQRELE(mp, udqp);
3555         XFS_QM_DQRELE(mp, gdqp);
3556
3557         if (!dp_joined_to_trans && (dp != NULL)) {
3558                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3559         }
3560
3561         goto std_return;
3562 }
3563
3564
3565 /*
3566  * xfs_fid2
3567  *
3568  * A fid routine that takes a pointer to a previously allocated
3569  * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3570  */
3571 STATIC int
3572 xfs_fid2(
3573         bhv_desc_t      *bdp,
3574         fid_t           *fidp)
3575 {
3576         xfs_inode_t     *ip;
3577         xfs_fid2_t      *xfid;
3578
3579         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3580                                        (inst_t *)__return_address);
3581         ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3582
3583         xfid = (xfs_fid2_t *)fidp;
3584         ip = XFS_BHVTOI(bdp);
3585         xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3586         xfid->fid_pad = 0;
3587         /*
3588          * use memcpy because the inode is a long long and there's no
3589          * assurance that xfid->fid_ino is properly aligned.
3590          */
3591         memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3592         xfid->fid_gen = ip->i_d.di_gen;
3593
3594         return 0;
3595 }
3596
3597
3598 /*
3599  * xfs_rwlock
3600  */
3601 int
3602 xfs_rwlock(
3603         bhv_desc_t      *bdp,
3604         vrwlock_t       locktype)
3605 {
3606         xfs_inode_t     *ip;
3607         vnode_t         *vp;
3608
3609         vp = BHV_TO_VNODE(bdp);
3610         if (vp->v_type == VDIR)
3611                 return 1;
3612         ip = XFS_BHVTOI(bdp);
3613         if (locktype == VRWLOCK_WRITE) {
3614                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3615         } else if (locktype == VRWLOCK_TRY_READ) {
3616                 return (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED));
3617         } else if (locktype == VRWLOCK_TRY_WRITE) {
3618                 return (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL));
3619         } else {
3620                 ASSERT((locktype == VRWLOCK_READ) ||
3621                        (locktype == VRWLOCK_WRITE_DIRECT));
3622                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3623         }
3624
3625         return 1;
3626 }
3627
3628
3629 /*
3630  * xfs_rwunlock
3631  */
3632 void
3633 xfs_rwunlock(
3634         bhv_desc_t      *bdp,
3635         vrwlock_t       locktype)
3636 {
3637         xfs_inode_t     *ip;
3638         vnode_t         *vp;
3639
3640         vp = BHV_TO_VNODE(bdp);
3641         if (vp->v_type == VDIR)
3642                 return;
3643         ip = XFS_BHVTOI(bdp);
3644         if (locktype == VRWLOCK_WRITE) {
3645                 /*
3646                  * In the write case, we may have added a new entry to
3647                  * the reference cache.  This might store a pointer to
3648                  * an inode to be released in this inode.  If it is there,
3649                  * clear the pointer and release the inode after unlocking
3650                  * this one.
3651                  */
3652                 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3653         } else {
3654                 ASSERT((locktype == VRWLOCK_READ) ||
3655                        (locktype == VRWLOCK_WRITE_DIRECT));
3656                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3657         }
3658         return;
3659 }
3660
3661 STATIC int
3662 xfs_inode_flush(
3663         bhv_desc_t      *bdp,
3664         int             flags)
3665 {
3666         xfs_inode_t     *ip;
3667         xfs_mount_t     *mp;
3668         int             error = 0;
3669
3670         ip = XFS_BHVTOI(bdp);
3671         mp = ip->i_mount;
3672
3673         if (XFS_FORCED_SHUTDOWN(mp))
3674                 return XFS_ERROR(EIO);
3675
3676         /* Bypass inodes which have already been cleaned by
3677          * the inode flush clustering code inside xfs_iflush
3678          */
3679         if ((ip->i_update_core == 0) &&
3680             ((ip->i_itemp == NULL) ||
3681              !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)))
3682                 return 0;
3683
3684         if (flags & FLUSH_LOG) {
3685                 xfs_inode_log_item_t *iip = ip->i_itemp;
3686
3687                 if (iip && iip->ili_last_lsn) {
3688                         xlog_t  *log = mp->m_log;
3689                         xfs_lsn_t       sync_lsn;
3690                         int             s, log_flags = XFS_LOG_FORCE;
3691
3692                         s = GRANT_LOCK(log);
3693                         sync_lsn = log->l_last_sync_lsn;
3694                         GRANT_UNLOCK(log, s);
3695
3696                         if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0))
3697                                 return 0;
3698
3699                         if (flags & FLUSH_SYNC)
3700                                 log_flags |= XFS_LOG_SYNC;
3701                         return xfs_log_force(mp, iip->ili_last_lsn,
3702                                                 log_flags);
3703                 }
3704         }
3705
3706         /* We make this non-blocking if the inode is contended,
3707          * return EAGAIN to indicate to the caller that they
3708          * did not succeed. This prevents the flush path from
3709          * blocking on inodes inside another operation right
3710          * now, they get caught later by xfs_sync.
3711          */
3712         if (flags & FLUSH_INODE) {
3713                 int     flush_flags;
3714
3715                 if (xfs_ipincount(ip))
3716                         return EAGAIN;
3717
3718                 if (flags & FLUSH_SYNC) {
3719                         xfs_ilock(ip, XFS_ILOCK_SHARED);
3720                         xfs_iflock(ip);
3721                 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3722                         if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3723                                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3724                                 return EAGAIN;
3725                         }
3726                 } else {
3727                         return EAGAIN;
3728                 }
3729
3730                 if (flags & FLUSH_SYNC)
3731                         flush_flags = XFS_IFLUSH_SYNC;
3732                 else
3733                         flush_flags = XFS_IFLUSH_ASYNC;
3734
3735                 error = xfs_iflush(ip, flush_flags);
3736                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3737         }
3738
3739         return error;
3740 }
3741
3742
3743 int
3744 xfs_set_dmattrs (
3745         bhv_desc_t      *bdp,
3746         u_int           evmask,
3747         u_int16_t       state,
3748         cred_t          *credp)
3749 {
3750         xfs_inode_t     *ip;
3751         xfs_trans_t     *tp;
3752         xfs_mount_t     *mp;
3753         int             error;
3754
3755         if (!capable(CAP_SYS_ADMIN))
3756                 return XFS_ERROR(EPERM);
3757
3758         ip = XFS_BHVTOI(bdp);
3759         mp = ip->i_mount;
3760
3761         if (XFS_FORCED_SHUTDOWN(mp))
3762                 return XFS_ERROR(EIO);
3763
3764         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3765         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3766         if (error) {
3767                 xfs_trans_cancel(tp, 0);
3768                 return error;
3769         }
3770         xfs_ilock(ip, XFS_ILOCK_EXCL);
3771         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3772
3773         ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3774         ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3775
3776         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3777         IHOLD(ip);
3778         error = xfs_trans_commit(tp, 0, NULL);
3779
3780         return error;
3781 }
3782
3783
3784 /*
3785  * xfs_reclaim
3786  */
3787 STATIC int
3788 xfs_reclaim(
3789         bhv_desc_t      *bdp)
3790 {
3791         xfs_inode_t     *ip;
3792         vnode_t         *vp;
3793
3794         vp = BHV_TO_VNODE(bdp);
3795
3796         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3797
3798         ASSERT(!VN_MAPPED(vp));
3799         ip = XFS_BHVTOI(bdp);
3800
3801         if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
3802                 if (ip->i_d.di_size > 0) {
3803                         /*
3804                          * Flush and invalidate any data left around that is
3805                          * a part of this file.
3806                          *
3807                          * Get the inode's i/o lock so that buffers are pushed
3808                          * out while holding the proper lock.  We can't hold
3809                          * the inode lock here since flushing out buffers may
3810                          * cause us to try to get the lock in xfs_strategy().
3811                          *
3812                          * We don't have to call remapf() here, because there
3813                          * cannot be any mapped file references to this vnode
3814                          * since it is being reclaimed.
3815                          */
3816                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
3817
3818                         /*
3819                          * If we hit an IO error, we need to make sure that the
3820                          * buffer and page caches of file data for
3821                          * the file are tossed away. We don't want to use
3822                          * VOP_FLUSHINVAL_PAGES here because we don't want dirty
3823                          * pages to stay attached to the vnode, but be
3824                          * marked P_BAD. pdflush/vnode_pagebad
3825                          * hates that.
3826                          */
3827                         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3828                                 VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_NONE);
3829                         } else {
3830                                 VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
3831                         }
3832
3833                         ASSERT(VN_CACHED(vp) == 0);
3834                         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
3835                                ip->i_delayed_blks == 0);
3836                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
3837                 } else if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3838                         /*
3839                          * di_size field may not be quite accurate if we're
3840                          * shutting down.
3841                          */
3842                         VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
3843                         ASSERT(VN_CACHED(vp) == 0);
3844                 }
3845         }
3846
3847         /* If we have nothing to flush with this inode then complete the
3848          * teardown now, otherwise break the link between the xfs inode
3849          * and the linux inode and clean up the xfs inode later. This
3850          * avoids flushing the inode to disk during the delete operation
3851          * itself.
3852          */
3853         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3854                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3855                 xfs_iflock(ip);
3856                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3857         } else {
3858                 xfs_mount_t     *mp = ip->i_mount;
3859
3860                 /* Protect sync from us */
3861                 XFS_MOUNT_ILOCK(mp);
3862                 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3863                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3864                 ip->i_flags |= XFS_IRECLAIMABLE;
3865                 XFS_MOUNT_IUNLOCK(mp);
3866         }
3867         return 0;
3868 }
3869
3870 int
3871 xfs_finish_reclaim(
3872         xfs_inode_t     *ip,
3873         int             locked,
3874         int             sync_mode)
3875 {
3876         xfs_ihash_t     *ih = ip->i_hash;
3877         int             error;
3878
3879         /* The hash lock here protects a thread in xfs_iget_core from
3880          * racing with us on linking the inode back with a vnode.
3881          * Once we have the XFS_IRECLAIM flag set it will not touch
3882          * us.
3883          */
3884         write_lock(&ih->ih_lock);
3885         if ((ip->i_flags & XFS_IRECLAIM) ||
3886             (!(ip->i_flags & XFS_IRECLAIMABLE) &&
3887               (XFS_ITOV_NULL(ip) == NULL))) {
3888                 write_unlock(&ih->ih_lock);
3889                 if (locked) {
3890                         xfs_ifunlock(ip);
3891                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3892                 }
3893                 return(1);
3894         }
3895         ip->i_flags |= XFS_IRECLAIM;
3896         write_unlock(&ih->ih_lock);
3897
3898         /*
3899          * If the inode is still dirty, then flush it out.  If the inode
3900          * is not in the AIL, then it will be OK to flush it delwri as
3901          * long as xfs_iflush() does not keep any references to the inode.
3902          * We leave that decision up to xfs_iflush() since it has the
3903          * knowledge of whether it's OK to simply do a delwri flush of
3904          * the inode or whether we need to wait until the inode is
3905          * pulled from the AIL.
3906          * We get the flush lock regardless, though, just to make sure
3907          * we don't free it while it is being flushed.
3908          */
3909         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3910                 if (!locked) {
3911                         xfs_ilock(ip, XFS_ILOCK_EXCL);
3912                         xfs_iflock(ip);
3913                 }
3914
3915                 if (ip->i_update_core ||
3916                     ((ip->i_itemp != NULL) &&
3917                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3918                         error = xfs_iflush(ip, sync_mode);
3919                         /*
3920                          * If we hit an error, typically because of filesystem
3921                          * shutdown, we don't need to let vn_reclaim to know
3922                          * because we're gonna reclaim the inode anyway.
3923                          */
3924                         if (error) {
3925                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3926                                 xfs_ireclaim(ip);
3927                                 return (0);
3928                         }
3929                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3930                 }
3931
3932                 ASSERT(ip->i_update_core == 0);
3933                 ASSERT(ip->i_itemp == NULL ||
3934                        ip->i_itemp->ili_format.ilf_fields == 0);
3935                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3936         } else if (locked) {
3937                 /*
3938                  * We are not interested in doing an iflush if we're
3939                  * in the process of shutting down the filesystem forcibly.
3940                  * So, just reclaim the inode.
3941                  */
3942                 xfs_ifunlock(ip);
3943                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3944         }
3945
3946         xfs_ireclaim(ip);
3947         return 0;
3948 }
3949
3950 int
3951 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
3952 {
3953         int             purged;
3954         struct list_head        *curr, *next;
3955         xfs_inode_t     *ip;
3956         int             done = 0;
3957
3958         while (!done) {
3959                 purged = 0;
3960                 XFS_MOUNT_ILOCK(mp);
3961                 list_for_each_safe(curr, next, &mp->m_del_inodes) {
3962                         ip = list_entry(curr, xfs_inode_t, i_reclaim);
3963                         if (noblock) {
3964                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
3965                                         continue;
3966                                 if (xfs_ipincount(ip) ||
3967                                     !xfs_iflock_nowait(ip)) {
3968                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3969                                         continue;
3970                                 }
3971                         }
3972                         XFS_MOUNT_IUNLOCK(mp);
3973                         xfs_finish_reclaim(ip, noblock,
3974                                 XFS_IFLUSH_DELWRI_ELSE_ASYNC);
3975                         purged = 1;
3976                         break;
3977                 }
3978
3979                 done = !purged;
3980         }
3981
3982         XFS_MOUNT_IUNLOCK(mp);
3983         return 0;
3984 }
3985
3986 /*
3987  * xfs_alloc_file_space()
3988  *      This routine allocates disk space for the given file.
3989  *
3990  *      If alloc_type == 0, this request is for an ALLOCSP type
3991  *      request which will change the file size.  In this case, no
3992  *      DMAPI event will be generated by the call.  A TRUNCATE event
3993  *      will be generated later by xfs_setattr.
3994  *
3995  *      If alloc_type != 0, this request is for a RESVSP type
3996  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
3997  *      lower block boundary byte address is less than the file's
3998  *      length.
3999  *
      * PARAMETERS:
      *      ip         - inode of the file that receives the space
      *      offset     - starting byte offset of the range
      *      len        - length of the range in bytes; must be > 0
      *      alloc_type - 0 for ALLOCSP, non-zero for RESVSP (a non-zero
      *                   value adds XFS_BMAPI_PREALLOC to the bmapi flags)
      *      attr_flags - if ATTR_DMI is set, no DMAPI events are sent
      *
      *      The range is allocated in a loop.  Every pass runs in its own
      *      transaction and advances by the block count of the single
      *      mapping that xfs_bmapi() hands back.
      *
4000  * RETURNS:
4001  *       0 on success
4002  *      errno on error
      *      (EIO if the filesystem has been forcibly shut down, EINVAL if
      *      len <= 0, ENOSPC if xfs_bmapi() returns no mapping, otherwise
      *      whatever error the called helpers report)
4003  *
4004  */
4005 int
4006 xfs_alloc_file_space(
4007         xfs_inode_t             *ip,
4008         xfs_off_t               offset,
4009         xfs_off_t               len,
4010         int                     alloc_type,
4011         int                     attr_flags)
4012 {
4013         xfs_filblks_t           allocated_fsb;
4014         xfs_filblks_t           allocatesize_fsb;
4015         int                     committed;
4016         xfs_off_t               count;
4017         xfs_filblks_t           datablocks;
4018         int                     error;
4019         xfs_fsblock_t           firstfsb;
4020         xfs_bmap_free_t         free_list;
4021         xfs_bmbt_irec_t         *imapp;
4022         xfs_bmbt_irec_t         imaps[1];
4023         xfs_mount_t             *mp;
4024         int                     numrtextents;
4025         int                     reccount;
4026         uint                    resblks;
4027         int                     rt;
4028         int                     rtextsize;
4029         xfs_fileoff_t           startoffset_fsb;
4030         xfs_trans_t             *tp;
4031         int                     xfs_bmapi_flags;
4032
4033         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4034         mp = ip->i_mount;
4035
4036         if (XFS_FORCED_SHUTDOWN(mp))
4037                 return XFS_ERROR(EIO);
4038
4039         /*
4040          * determine if this is a realtime file
              *
              * For a realtime inode rtextsize is ip->i_d.di_extsize when that
              * is non-zero, otherwise mp->m_sb.sb_rextsize.  It stays 0 for
              * every other inode.
4041          */
4042         if ((rt = XFS_IS_REALTIME_INODE(ip)) != 0) {
4043                 if (ip->i_d.di_extsize)
4044                         rtextsize = ip->i_d.di_extsize;
4045                 else
4046                         rtextsize = mp->m_sb.sb_rextsize;
4047         } else
4048                 rtextsize = 0;
4049
4050         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4051                 return error;
4052
4053         if (len <= 0)
4054                 return XFS_ERROR(EINVAL);
4055
             /*
              * Set up a single-mapping request (reccount == 1, imapp points
              * at imaps[0]) and convert the byte range into filesystem blocks.
              */
4056         count = len;
4057         error = 0;
4058         imapp = &imaps[0];
4059         reccount = 1;
4060         xfs_bmapi_flags = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
4061         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
4062         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4063
4064         /*      Generate a DMAPI event if needed.       */
4065         if (alloc_type != 0 && offset < ip->i_d.di_size &&
4066                         (attr_flags&ATTR_DMI) == 0  &&
4067                         DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4068                 xfs_off_t           end_dmi_offset;
4069
                     /* The event covers only the part of the range below EOF. */
4070                 end_dmi_offset = offset+len;
4071                 if (end_dmi_offset > ip->i_d.di_size)
4072                         end_dmi_offset = ip->i_d.di_size;
4073                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4074                         offset, end_dmi_offset - offset,
4075                         0, NULL);
4076                 if (error)
4077                         return(error);
4078         }
4079
4080         /*
4081          * allocate file space until done or until there is an error
4082          */
4083 retry:
4084         while (allocatesize_fsb && !error) {
4085                 /*
4086                  * determine if reserving space on
4087                  * the data or realtime partition.
4088                  */
4089                 if (rt) {
4090                         xfs_fileoff_t s, e;
4091
                             /*
                              * s is the start block rounded down to a multiple
                              * of rtextsize and e is the end block rounded up;
                              * the span is then expressed in units of
                              * mp->m_sb.sb_rextsize.
                              */
4092                         s = startoffset_fsb;
4093                         do_div(s, rtextsize);
4094                         s *= rtextsize;
4095                         e = roundup_64(startoffset_fsb + allocatesize_fsb,
4096                                 rtextsize);
4097                         numrtextents = (int)(e - s) / mp->m_sb.sb_rextsize;
4098                         datablocks = 0;
4099                 } else {
4100                         datablocks = allocatesize_fsb;
4101                         numrtextents = 0;
4102                 }
4103
4104                 /*
4105                  * allocate and setup the transaction
4106                  */
4107                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4108                 resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
4109                 error = xfs_trans_reserve(tp,
4110                                           resblks,
4111                                           XFS_WRITE_LOG_RES(mp),
4112                                           numrtextents,
4113                                           XFS_TRANS_PERM_LOG_RES,
4114                                           XFS_WRITE_LOG_COUNT);
4115
4116                 /*
4117                  * check for running out of space
4118                  */
4119                 if (error) {
4120                         /*
4121                          * Free the transaction structure.
4122                          */
4123                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4124                         xfs_trans_cancel(tp, 0);
4125                         break;
4126                 }
                     /*
                      * The ILOCK is taken before the quota reservation; if the
                      * reservation fails, error1 cancels the transaction and
                      * drops the lock again.
                      */
4127                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4128                 error = XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp,
4129                                 ip->i_udquot, ip->i_gdquot, resblks, 0, rt ?
4130                                 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
4131                 if (error)
4132                         goto error1;
4133
4134                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4135                 xfs_trans_ihold(tp, ip);
4136
4137                 /*
4138                  * issue the bmapi() call to allocate the blocks
4139                  */
4140                 XFS_BMAP_INIT(&free_list, &firstfsb);
4141                 error = xfs_bmapi(tp, ip, startoffset_fsb,
4142                                   allocatesize_fsb, xfs_bmapi_flags,
4143                                   &firstfsb, 0, imapp, &reccount,
4144                                   &free_list);
4145                 if (error) {
4146                         goto error0;
4147                 }
4148
4149                 /*
4150                  * complete the transaction
4151                  */
4152                 error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
4153                 if (error) {
4154                         goto error0;
4155                 }
4156
4157                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
4158                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4159                 if (error) {
4160                         break;
4161                 }
4162
                     /*
                      * The mapping is read before reccount is checked, but the
                      * value is only consumed when reccount != 0.
                      */
4163                 allocated_fsb = imapp->br_blockcount;
4164
                     /* No mapping came back: report that as out of space. */
4165                 if (reccount == 0) {
4166                         error = XFS_ERROR(ENOSPC);
4167                         break;
4168                 }
4169
                     /* Advance past what this pass allocated. */
4170                 startoffset_fsb += allocated_fsb;
4171                 allocatesize_fsb -= allocated_fsb;
4172         }
     /*
      * On ENOSPC, send a DM_EVENT_NOSPACE event (unless ATTR_DMI is set or the
      * event is not enabled).  If the event is delivered without error, the
      * allocation loop is re-entered at the point where it stopped.
      */
4173 dmapi_enospc_check:
4174         if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 &&
4175             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {
4176
4177                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4178                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4179                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4180                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4181                 if (error == 0)
4182                         goto retry;     /* Maybe DMAPI app. has made space */
4183                 /* else fall through with error from XFS_SEND_NAMESP */
4184         }
4185
4186         return error;
4187
     /*
      * error0: the bmap free list is live and is cancelled first.
      * error1: cancel the transaction, drop the ILOCK, then run the
      *         ENOSPC/DMAPI check above with the error that got us here.
      */
4188  error0:
4189         xfs_bmap_cancel(&free_list);
4190  error1:
4191         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4192         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4193         goto dmapi_enospc_check;
4194 }
4195
4196 /*
4197  * Zero file bytes between startoff and endoff inclusive.
4198  * The iolock is held exclusive and no blocks are buffered.
      *
      * The range is walked one filesystem block at a time.  Each block is
      * looked up with xfs_bmapi(); holes and unwritten extents are skipped.
      * Any other block is read into a private buffer, the affected bytes
      * are cleared with memset(), and the block is written back.
      *
      * RETURNS:
      *       0 on success
      *      ENOMEM if no buffer could be obtained
      *      otherwise the error from xfs_bmapi() or xfs_iowait()
4199  */
4200 STATIC int
4201 xfs_zero_remaining_bytes(
4202         xfs_inode_t             *ip,
4203         xfs_off_t               startoff,
4204         xfs_off_t               endoff)
4205 {
4206         xfs_bmbt_irec_t         imap;
4207         xfs_fileoff_t           offset_fsb;
4208         xfs_off_t               lastoffset;
4209         xfs_off_t               offset;
4210         xfs_buf_t               *bp;
4211         xfs_mount_t             *mp = ip->i_mount;
4212         int                     nimap;
4213         int                     error = 0;
4214
             /*
              * One block-sized buffer on the realtime or data device target,
              * reused for every block in the range.
              */
4215         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4216                                 ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4217                                 mp->m_rtdev_targp : mp->m_ddev_targp);
             /*
              * Without this check a failed buffer allocation would hand a
              * NULL pointer to the XFS_BUF_* macros and xfs_buf_free() below.
              */
             if (!bp)
                     return XFS_ERROR(ENOMEM);
4218
4219         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4220                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
4221                 nimap = 1;
4222                 error = xfs_bmapi(NULL, ip, offset_fsb, 1, 0, NULL, 0, &imap,
4223                         &nimap, NULL);
4224                 if (error || nimap < 1)
4225                         break;
4226                 ASSERT(imap.br_blockcount >= 1);
4227                 ASSERT(imap.br_startoff == offset_fsb);
                     /*
                      * lastoffset is the last byte of this block, clamped to
                      * endoff.  It is always set before any "continue", so the
                      * loop increment never reads it uninitialised.
                      */
4228                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4229                 if (lastoffset > endoff)
4230                         lastoffset = endoff;
                     /* Nothing on disk to zero for a hole. */
4231                 if (imap.br_startblock == HOLESTARTBLOCK)
4232                         continue;
4233                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
                     /* Unwritten extents are skipped as well. */
4234                 if (imap.br_state == XFS_EXT_UNWRITTEN)
4235                         continue;
                     /* Read the block. */
4236                 XFS_BUF_UNDONE(bp);
4237                 XFS_BUF_UNWRITE(bp);
4238                 XFS_BUF_READ(bp);
4239                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4240                 xfsbdstrat(mp, bp);
4241                 if ((error = xfs_iowait(bp))) {
4242                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4243                                           mp, bp, XFS_BUF_ADDR(bp));
4244                         break;
4245                 }
                     /* Clear bytes offset..lastoffset inside the block. */
4246                 memset(XFS_BUF_PTR(bp) +
4247                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4248                       0, lastoffset - offset + 1);
                     /* Write the block back to the same address. */
4249                 XFS_BUF_UNDONE(bp);
4250                 XFS_BUF_UNREAD(bp);
4251                 XFS_BUF_WRITE(bp);
4252                 xfsbdstrat(mp, bp);
4253                 if ((error = xfs_iowait(bp))) {
4254                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4255                                           mp, bp, XFS_BUF_ADDR(bp));
4256                         break;
4257                 }
4258         }
4259         xfs_buf_free(bp);
4260         return error;
4261 }
4262
4263 /*
4264  * xfs_free_file_space()
4265  *      This routine frees disk space for the given file.
4266  *
4267  *      This routine is only called by xfs_change_file_space
4268  *      for an UNRESVSP type call.
4269  *
      *      Steps, in order:
      *        - send a DMAPI DM_EVENT_WRITE event for the part of the range
      *          below EOF (unless ATTR_DMI is set or the event is disabled)
      *        - take the iolock exclusive and invalidate cached pages from
      *          the rounded-down start of the range
      *        - zero on disk the bytes of the range that are not freed
      *          (xfs_zero_remaining_bytes)
      *        - unmap the blocks between startoffset_fsb and endoffset_fsb
      *          with xfs_bunmapi(), one transaction per pass, until it
      *          reports done
      *
      * PARAMETERS:
      *      ip         - inode of the file
      *      offset     - starting byte offset of the range
      *      len        - length of the range in bytes; <= 0 frees nothing
      *      attr_flags - ATTR_DMI suppresses the DMAPI event; also supplies
      *                   the delay flag via AT_DELAY_FLAG()
      *
4270  * RETURNS:
4271  *       0 on success
4272  *      errno on error
4273  *
4274  */
4275 STATIC int
4276 xfs_free_file_space(
4277         xfs_inode_t             *ip,
4278         xfs_off_t               offset,
4279         xfs_off_t               len,
4280         int                     attr_flags)
4281 {
4282         int                     committed;
4283         int                     done;
4284         xfs_off_t               end_dmi_offset;
4285         xfs_fileoff_t           endoffset_fsb;
4286         int                     error;
4287         xfs_fsblock_t           firstfsb;
4288         xfs_bmap_free_t         free_list;
4289         xfs_off_t               ilen;
4290         xfs_bmbt_irec_t         imap;
4291         xfs_off_t               ioffset;
4292         xfs_extlen_t            mod=0;
4293         xfs_mount_t             *mp;
4294         int                     nimap;
4295         uint                    resblks;
4296         int                     rounding;
4297         int                     rt;
4298         xfs_fileoff_t           startoffset_fsb;
4299         xfs_trans_t             *tp;
4300
4301         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4302         mp = ip->i_mount;
4303
4304         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4305                 return error;
4306
4307         error = 0;
4308         if (len <= 0)   /* if nothing being freed */
4309                 return error;
4310         rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
             /*
              * startoffset_fsb and endoffset_fsb bound the blocks that are
              * unmapped; bytes of the range outside them are zeroed below.
              */
4311         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
4312         end_dmi_offset = offset + len;
4313         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4314
4315         if (offset < ip->i_d.di_size &&
4316             (attr_flags & ATTR_DMI) == 0 &&
4317             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
                     /* The event covers only the part of the range below EOF. */
4318                 if (end_dmi_offset > ip->i_d.di_size)
4319                         end_dmi_offset = ip->i_d.di_size;
4320                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4321                                 offset, end_dmi_offset - offset,
4322                                 AT_DELAY_FLAG(attr_flags), NULL);
4323                 if (error)
4324                         return(error);
4325         }
4326
4327         xfs_ilock(ip, XFS_IOLOCK_EXCL);
             /*
              * Round the invalidation range to the larger of the filesystem
              * block size and NBPP.  The operands are compared as int: casting
              * them to __uint8_t truncates any value that is a multiple of 256
              * to 0, which left rounding == 0 and therefore ioffset == 0, so
              * invalidation always started at the beginning of the file.
              */
4328         rounding = MAX((int)(1 << mp->m_sb.sb_blocklog),
4329                         (int)NBPP);
             /* ilen is computed here but not used further in this function. */
4330         ilen = len + (offset & (rounding - 1));
4331         ioffset = offset & ~(rounding - 1);
4332         if (ilen & (rounding - 1))
4333                 ilen = (ilen + rounding) & ~(rounding - 1);
4334         xfs_inval_cached_pages(XFS_ITOV(ip), &(ip->i_iocore), ioffset, 0, 0);
4335         /*
4336          * Need to zero the stuff we're not freeing, on disk.
4337          * If its a realtime file & can't use unwritten extents then we
4338          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4339          * will take care of it for us.
4340          */
4341         if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
                     /* Look up the first block of the range. */
4342                 nimap = 1;
4343                 error = xfs_bmapi(NULL, ip, startoffset_fsb, 1, 0, NULL, 0,
4344                         &imap, &nimap, NULL);
4345                 if (error) {
4346                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4347                         return error;
4348                 }
4349                 ASSERT(nimap == 0 || nimap == 1);
4350                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4351                         xfs_daddr_t     block;
4352
4353                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
                             /*
                              * If the start block is not a multiple of
                              * sb_rextsize, move the start forward by
                              * sb_rextsize - mod blocks.
                              */
4354                         block = imap.br_startblock;
4355                         mod = do_div(block, mp->m_sb.sb_rextsize);
4356                         if (mod)
4357                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4358                 }
                     /* Look up the last block of the range. */
4359                 nimap = 1;
4360                 error = xfs_bmapi(NULL, ip, endoffset_fsb - 1, 1, 0, NULL, 0,
4361                         &imap, &nimap, NULL);
4362                 if (error) {
4363                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4364                         return error;
4365                 }
4366                 ASSERT(nimap == 0 || nimap == 1);
4367                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4368                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
                             /*
                              * NOTE(review): mod still holds the remainder
                              * computed for the start block (or 0); it is not
                              * recomputed from this mapping.  Confirm that
                              * this is intended.
                              */
4369                         mod++;
4370                         if (mod && (mod != mp->m_sb.sb_rextsize))
4371                                 endoffset_fsb -= mod;
4372                 }
4373         }
             /*
              * done is set when there is no whole block left to unmap; the
              * unmap loop below is then skipped entirely.
              */
4374         if ((done = (endoffset_fsb <= startoffset_fsb)))
4375                 /*
4376                  * One contiguous piece to clear
4377                  */
4378                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4379         else {
4380                 /*
4381                  * Some full blocks, possibly two pieces to clear
4382                  */
4383                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4384                         error = xfs_zero_remaining_bytes(ip, offset,
4385                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4386                 if (!error &&
4387                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4388                         error = xfs_zero_remaining_bytes(ip,
4389                                 XFS_FSB_TO_B(mp, endoffset_fsb),
4390                                 offset + len - 1);
4391         }
4392
4393         /*
4394          * free file space until done or until there is an error
4395          */
4396         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4397         while (!error && !done) {
4398
4399                 /*
4400                  * allocate and setup the transaction
4401                  */
4402                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4403                 error = xfs_trans_reserve(tp,
4404                                           resblks,
4405                                           XFS_WRITE_LOG_RES(mp),
4406                                           0,
4407                                           XFS_TRANS_PERM_LOG_RES,
4408                                           XFS_WRITE_LOG_COUNT);
4409
4410                 /*
4411                  * check for running out of space
4412                  */
4413                 if (error) {
4414                         /*
4415                          * Free the transaction structure.
4416                          */
4417                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4418                         xfs_trans_cancel(tp, 0);
4419                         break;
4420                 }
4421                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4422                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4423                                 ip->i_udquot, ip->i_gdquot, resblks, 0, rt ?
4424                                 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS);
4425                 if (error)
4426                         goto error1;
4427
4428                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4429                 xfs_trans_ihold(tp, ip);
4430
4431                 /*
4432                  * issue the bunmapi() call to free the blocks
4433                  */
4434                 XFS_BMAP_INIT(&free_list, &firstfsb);
4435                 error = xfs_bunmapi(tp, ip, startoffset_fsb,
4436                                   endoffset_fsb - startoffset_fsb,
4437                                   0, 2, &firstfsb, &free_list, &done);
4438                 if (error) {
4439                         goto error0;
4440                 }
4441
4442                 /*
4443                  * complete the transaction
4444                  */
4445                 error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed);
4446                 if (error) {
4447                         goto error0;
4448                 }
4449
4450                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL);
4451                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4452         }
4453
4454         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4455         return error;
4456
     /*
      * error0: the bmap free list is live and is cancelled first.
      * error1: cancel the transaction and drop both the ILOCK and the IOLOCK.
      */
4457  error0:
4458         xfs_bmap_cancel(&free_list);
4459  error1:
4460         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4461         xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
4462         return error;
4463 }
4464
4465 /*
4466  * xfs_change_file_space()
4467  *      This routine allocates or frees disk space for the given file.
4468  *      The user specified parameters are checked for alignment and size
4469  *      limitations.
4470  *
      * PARAMETERS:
      *      bdp        - behavior descriptor of the file; the vnode and the
      *                   XFS inode are derived from it
      *      cmd        - one of the XFS_IOC_{RESVSP,UNRESVSP,ALLOCSP,FREESP}
      *                   commands or their 64-bit variants
      *      bf         - range description (l_whence, l_start, l_len);
      *                   l_start is rewritten as an absolute offset and
      *                   l_whence is reset to 0
      *      offset     - added to l_start when l_whence is 1 (SEEK_CUR)
      *      credp      - credentials used for the write-permission check
      *      attr_flags - passed through to the allocate/free/setattr calls
      *
      *      After the space operation succeeds, one synchronous transaction
      *      clears S_ISUID (and S_ISGID if S_IXGRP is set), updates the
      *      modification and change timestamps, and sets XFS_DIFLAG_PREALLOC
      *      for RESVSP or clears it for ALLOCSP/FREESP.
      *
4471  * RETURNS:
4472  *       0 on success
4473  *      errno on error
4474  *
4475  */
4476 int
4477 xfs_change_file_space(
4478         bhv_desc_t      *bdp,
4479         int             cmd,
4480         xfs_flock64_t   *bf,
4481         xfs_off_t       offset,
4482         cred_t          *credp,
4483         int             attr_flags)
4484 {
4485         int             clrprealloc;
4486         int             error;
4487         xfs_fsize_t     fsize;
4488         xfs_inode_t     *ip;
4489         xfs_mount_t     *mp;
4490         int             setprealloc;
4491         xfs_off_t       startoffset;
4492         xfs_off_t       llen;
4493         xfs_trans_t     *tp;
4494         vattr_t         va;
4495         vnode_t         *vp;
4496
4497         vp = BHV_TO_VNODE(bdp);
4498         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4499
4500         ip = XFS_BHVTOI(bdp);
4501         mp = ip->i_mount;
4502
4503         /*
4504          * must be a regular file and have write permission
4505          */
4506         if (vp->v_type != VREG)
4507                 return XFS_ERROR(EINVAL);
4508
4509         xfs_ilock(ip, XFS_ILOCK_SHARED);
4510
4511         if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
4512                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
4513                 return error;
4514         }
4515
4516         xfs_iunlock(ip, XFS_ILOCK_SHARED);
4517
             /* Turn l_start into an absolute file offset. */
4518         switch (bf->l_whence) {
4519         case 0: /*SEEK_SET*/
4520                 break;
4521         case 1: /*SEEK_CUR*/
4522                 bf->l_start += offset;
4523                 break;
4524         case 2: /*SEEK_END*/
4525                 bf->l_start += ip->i_d.di_size;
4526                 break;
4527         default:
4528                 return XFS_ERROR(EINVAL);
4529         }
4530
             /*
              * llen is the distance from l_start to the last byte of the
              * range (l_len - 1 for a positive length), used only for the
              * bounds check that follows.
              */
4531         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4532
4533         if (   (bf->l_start < 0)
4534             || (bf->l_start > XFS_MAXIOFFSET(mp))
4535             || (bf->l_start + llen < 0)
4536             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4537                 return XFS_ERROR(EINVAL);
4538
4539         bf->l_whence = 0;
4540
4541         startoffset = bf->l_start;
4542         fsize = ip->i_d.di_size;
4543
4544         /*
4545          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4546          * file space.
4547          * These calls do NOT zero the data space allocated to the file,
4548          * nor do they change the file size.
4549          *
4550          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4551          * space.
4552          * These calls cause the new file data to be zeroed and the file
4553          * size to be changed.
4554          */
4555         setprealloc = clrprealloc = 0;
4556
4557         switch (cmd) {
4558         case XFS_IOC_RESVSP:
4559         case XFS_IOC_RESVSP64:
4560                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4561                                                                 1, attr_flags);
4562                 if (error)
4563                         return error;
4564                 setprealloc = 1;
4565                 break;
4566
4567         case XFS_IOC_UNRESVSP:
4568         case XFS_IOC_UNRESVSP64:
4569                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4570                                                                 attr_flags)))
4571                         return error;
4572                 break;
4573
4574         case XFS_IOC_ALLOCSP:
4575         case XFS_IOC_ALLOCSP64:
4576         case XFS_IOC_FREESP:
4577         case XFS_IOC_FREESP64:
                     /*
                      * Growing the file: allocate the blocks between the old
                      * EOF and the new one before the size is changed.
                      */
4578                 if (startoffset > fsize) {
4579                         error = xfs_alloc_file_space(ip, fsize,
4580                                         startoffset - fsize, 0, attr_flags);
                             /*
                              * Return the failure to the caller.  A bare
                              * "break" would only leave the switch, and the
                              * error would then be overwritten by the result
                              * of xfs_trans_reserve() below.
                              */
4581                         if (error)
4582                                 return error;
4583                 }
4584
                     /* The new file size is exactly the requested offset. */
4585                 va.va_mask = XFS_AT_SIZE;
4586                 va.va_size = startoffset;
4587
4588                 error = xfs_setattr(bdp, &va, attr_flags, credp);
4589
4590                 if (error)
4591                         return error;
4592
4593                 clrprealloc = 1;
4594                 break;
4595
4596         default:
4597                 ASSERT(0);
4598                 return XFS_ERROR(EINVAL);
4599         }
4600
4601         /*
4602          * update the inode timestamp, mode, and prealloc flag bits
4603          */
4604         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4605
4606         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4607                                       0, 0, 0))) {
4608                 /* ASSERT(0); */
4609                 xfs_trans_cancel(tp, 0);
4610                 return error;
4611         }
4612
4613         xfs_ilock(ip, XFS_ILOCK_EXCL);
4614
4615         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4616         xfs_trans_ihold(tp, ip);
4617
4618         ip->i_d.di_mode &= ~S_ISUID;
4619
4620         /*
4621          * Note that we don't have to worry about mandatory
4622          * file locking being disabled here because we only
4623          * clear the S_ISGID bit if the Group execute bit is
4624          * on, but if it was on then mandatory locking wouldn't
4625          * have been enabled.
4626          */
4627         if (ip->i_d.di_mode & S_IXGRP)
4628                 ip->i_d.di_mode &= ~S_ISGID;
4629
4630         xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4631
             /* UNRESVSP sets neither flag and leaves the prealloc bit alone. */
4632         if (setprealloc)
4633                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4634         else if (clrprealloc)
4635                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4636
4637         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4638         xfs_trans_set_sync(tp);
4639
4640         error = xfs_trans_commit(tp, 0, NULL);
4641
4642         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4643
4644         return error;
4645 }
4646
4647 vnodeops_t xfs_vnodeops = {
             /*
              * Vnode operations table for XFS.  Each vop_* slot is bound to
              * the XFS implementation of that operation.  A few slots use
              * generic fs_* helpers instead (fs_noval, fs_tosspages,
              * fs_flushinval_pages, fs_flush_pages).
              */
4648         BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
4649         .vop_open               = xfs_open,
4650         .vop_read               = xfs_read,
             /* The sendfile slot is only filled in when HAVE_SENDFILE is defined. */
4651 #ifdef HAVE_SENDFILE
4652         .vop_sendfile           = xfs_sendfile,
4653 #endif
4654         .vop_write              = xfs_write,
4655         .vop_ioctl              = xfs_ioctl,
4656         .vop_getattr            = xfs_getattr,
4657         .vop_setattr            = xfs_setattr,
4658         .vop_access             = xfs_access,
4659         .vop_lookup             = xfs_lookup,
4660         .vop_create             = xfs_create,
4661         .vop_remove             = xfs_remove,
4662         .vop_link               = xfs_link,
4663         .vop_rename             = xfs_rename,
4664         .vop_mkdir              = xfs_mkdir,
4665         .vop_rmdir              = xfs_rmdir,
4666         .vop_readdir            = xfs_readdir,
4667         .vop_symlink            = xfs_symlink,
4668         .vop_readlink           = xfs_readlink,
4669         .vop_fsync              = xfs_fsync,
4670         .vop_inactive           = xfs_inactive,
4671         .vop_fid2               = xfs_fid2,
4672         .vop_rwlock             = xfs_rwlock,
4673         .vop_rwunlock           = xfs_rwunlock,
4674         .vop_bmap               = xfs_bmap,
4675         .vop_reclaim            = xfs_reclaim,
4676         .vop_attr_get           = xfs_attr_get,
4677         .vop_attr_set           = xfs_attr_set,
4678         .vop_attr_remove        = xfs_attr_remove,
4679         .vop_attr_list          = xfs_attr_list,
             /* fs_noval is cast to the slot's function type for these two. */
4680         .vop_link_removed       = (vop_link_removed_t)fs_noval,
4681         .vop_vnode_change       = (vop_vnode_change_t)fs_noval,
             /* Page-cache operations use the generic fs_* helpers. */
4682         .vop_tosspages          = fs_tosspages,
4683         .vop_flushinval_pages   = fs_flushinval_pages,
4684         .vop_flush_pages        = fs_flush_pages,
4685         .vop_release            = xfs_release,
4686         .vop_iflush             = xfs_inode_flush,
4687 };