fs/xfs/linux-2.6/xfs_lrw.c

   1 /*
   2  * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms of version 2 of the GNU General Public License as
   6  * published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it would be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  11  *
  12  * Further, this software is distributed without any warranty that it is
  13  * free of the rightful claim of any third person regarding infringement
  14  * or the like.  Any license provided herein, whether implied or
  15  * otherwise, applies only to this software file.  Patent licenses, if
  16  * any, provided herein do not apply to combinations of this program with
  17  * other software, or any other product whatsoever.
  18  *
  19  * You should have received a copy of the GNU General Public License along
  20  * with this program; if not, write the Free Software Foundation, Inc., 59
  21  * Temple Place - Suite 330, Boston MA 02111-1307, USA.
  22  *
  23  * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
  24  * Mountain View, CA  94043, or:
  25  *
  26  * http://www.sgi.com
  27  *
  28  * For further information regarding this notice, see:
  29  *
  30  * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
  31  */
  32 /*
  33  *  fs/xfs/linux/xfs_lrw.c (Linux Read Write stuff)
  34  *
  35  */
  36
  37 #include "xfs.h"
  38
  39 #include "xfs_fs.h"
  40 #include "xfs_inum.h"
  41 #include "xfs_log.h"
  42 #include "xfs_trans.h"
  43 #include "xfs_sb.h"
  44 #include "xfs_ag.h"
  45 #include "xfs_dir.h"
  46 #include "xfs_dir2.h"
  47 #include "xfs_alloc.h"
  48 #include "xfs_dmapi.h"
  49 #include "xfs_quota.h"
  50 #include "xfs_mount.h"
  51 #include "xfs_alloc_btree.h"
  52 #include "xfs_bmap_btree.h"
  53 #include "xfs_ialloc_btree.h"
  54 #include "xfs_btree.h"
  55 #include "xfs_ialloc.h"
  56 #include "xfs_attr_sf.h"
  57 #include "xfs_dir_sf.h"
  58 #include "xfs_dir2_sf.h"
  59 #include "xfs_dinode.h"
  60 #include "xfs_inode.h"
  61 #include "xfs_bmap.h"
  62 #include "xfs_bit.h"
  63 #include "xfs_rtalloc.h"
  64 #include "xfs_error.h"
  65 #include "xfs_itable.h"
  66 #include "xfs_rw.h"
  67 #include "xfs_acl.h"
  68 #include "xfs_cap.h"
  69 #include "xfs_mac.h"
  70 #include "xfs_attr.h"
  71 #include "xfs_inode_item.h"
  72 #include "xfs_buf_item.h"
  73 #include "xfs_utils.h"
  74 #include "xfs_iomap.h"
  75
  76 #include <linux/capability.h>
  77
  78
  79 #if defined(XFS_RW_TRACE)
  80 void
  81 xfs_rw_enter_trace(
  82         int                     tag,
  83         xfs_iocore_t            *io,
  84         const struct iovec      *iovp,
  85         size_t                  segs,
  86         loff_t                  offset,
  87         int                     ioflags)
  88 {
  89         xfs_inode_t     *ip = XFS_IO_INODE(io);
  90
  91         if (ip->i_rwtrace == NULL)
  92                 return;
  93         ktrace_enter(ip->i_rwtrace,
  94                 (void *)(unsigned long)tag,
  95                 (void *)ip,
  96                 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
  97                 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
  98                 (void *)(__psint_t)iovp,
  99                 (void *)((unsigned long)segs),
 100                 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
 101                 (void *)((unsigned long)(offset & 0xffffffff)),
 102                 (void *)((unsigned long)ioflags),
 103                 (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)),
 104                 (void *)((unsigned long)(io->io_new_size & 0xffffffff)),
 105                 (void *)NULL,
 106                 (void *)NULL,
 107                 (void *)NULL,
 108                 (void *)NULL,
 109                 (void *)NULL);
 110 }
 111
 112 void
 113 xfs_inval_cached_trace(
 114         xfs_iocore_t    *io,
 115         xfs_off_t       offset,
 116         xfs_off_t       len,
 117         xfs_off_t       first,
 118         xfs_off_t       last)
 119 {
 120         xfs_inode_t     *ip = XFS_IO_INODE(io);
 121
 122         if (ip->i_rwtrace == NULL)
 123                 return;
 124         ktrace_enter(ip->i_rwtrace,
 125                 (void *)(__psint_t)XFS_INVAL_CACHED,
 126                 (void *)ip,
 127                 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
 128                 (void *)((unsigned long)(offset & 0xffffffff)),
 129                 (void *)((unsigned long)((len >> 32) & 0xffffffff)),
 130                 (void *)((unsigned long)(len & 0xffffffff)),
 131                 (void *)((unsigned long)((first >> 32) & 0xffffffff)),
 132                 (void *)((unsigned long)(first & 0xffffffff)),
 133                 (void *)((unsigned long)((last >> 32) & 0xffffffff)),
 134                 (void *)((unsigned long)(last & 0xffffffff)),
 135                 (void *)NULL,
 136                 (void *)NULL,
 137                 (void *)NULL,
 138                 (void *)NULL,
 139                 (void *)NULL,
 140                 (void *)NULL);
 141 }
 142 #endif
 143
 144 /*
 145  *      xfs_iozero
 146  *
 147  *      xfs_iozero clears the specified range of buffer supplied,
 148  *      and marks all the affected blocks as valid and modified.  If
 149  *      an affected block is not allocated, it will be allocated.  If
 150  *      an affected block is not completely overwritten, and is not
 151  *      valid before the operation, it will be read from disk before
 152  *      being partially zeroed.
 153  */
 154 STATIC int
 155 xfs_iozero(
 156         struct inode            *ip,    /* inode                        */
 157         loff_t                  pos,    /* offset in file               */
 158         size_t                  count,  /* size of data to zero         */
 159         loff_t                  end_size)       /* max file size to set */
 160 {
 161         unsigned                bytes;
 162         struct page             *page;
 163         struct address_space    *mapping;
 164         char                    *kaddr;
 165         int                     status;
 166
 167         mapping = ip->i_mapping;
 168         do {
 169                 unsigned long index, offset;
 170
 171                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
 172                 index = pos >> PAGE_CACHE_SHIFT;
 173                 bytes = PAGE_CACHE_SIZE - offset;
 174                 if (bytes > count)
 175                         bytes = count;
 176
 177                 status = -ENOMEM;
 178                 page = grab_cache_page(mapping, index);
 179                 if (!page)
 180                         break;
 181
 182                 kaddr = kmap(page);
 183                 status = mapping->a_ops->prepare_write(NULL, page, offset,
 184                                                         offset + bytes);
 185                 if (status) {
 186                         goto unlock;
 187                 }
 188
 189                 memset((void *) (kaddr + offset), 0, bytes);
 190                 flush_dcache_page(page);
 191                 status = mapping->a_ops->commit_write(NULL, page, offset,
 192                                                         offset + bytes);
 193                 if (!status) {
 194                         pos += bytes;
 195                         count -= bytes;
 196                         if (pos > i_size_read(ip))
 197                                 i_size_write(ip, pos < end_size ? pos : end_size);
 198                 }
 199
 200 unlock:
 201                 kunmap(page);
 202                 unlock_page(page);
 203                 page_cache_release(page);
 204                 if (status)
 205                         break;
 206         } while (count);
 207
 208         return (-status);
 209 }
 210
 211 /*
 212  * xfs_inval_cached_pages
 213  *
 214  * This routine is responsible for keeping direct I/O and buffered I/O
 215  * somewhat coherent.  From here we make sure that we're at least
 216  * temporarily holding the inode I/O lock exclusively and then call
 217  * the page cache to flush and invalidate any cached pages.  If there
 218  * are no cached pages this routine will be very quick.
 219  */
 220 void
 221 xfs_inval_cached_pages(
 222         vnode_t         *vp,
 223         xfs_iocore_t    *io,
 224         xfs_off_t       offset,
 225         int             write,
 226         int             relock)
 227 {
 228 #if 0
 229         xfs_mount_t     *mp;
 230 #endif
 231
 232         if (!VN_CACHED(vp)) {
 233                 return;
 234         }
 235
 236 #if 0
 237         mp = io->io_mount;
 238
 239         /*
 240          * We need to get the I/O lock exclusively in order
 241          * to safely invalidate pages and mappings.
 242          */
 243         if (relock) {
 244                 XFS_IUNLOCK(mp, io, XFS_IOLOCK_SHARED);
 245                 XFS_ILOCK(mp, io, XFS_IOLOCK_EXCL);
 246         }
 247
 248         /* Writing beyond EOF creates a hole that must be zeroed */
 249         if (write && (offset > XFS_SIZE(mp, io))) {
 250                 xfs_fsize_t     isize;
 251
 252                 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 253                 isize = XFS_SIZE(mp, io);
 254                 if (offset > isize) {
 255                         xfs_zero_eof(vp, io, offset, isize, offset);
 256                 }
 257                 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 258         }
 259 #endif
 260
 261         xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1);
 262         VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED);
 263 #if 0
 264         if (relock) {
 265                 XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL);
 266         }
 267 #endif
 268 }
 269
 270 ssize_t                 /* bytes read, or (-)  error */
 271 xfs_read(
 272         bhv_desc_t              *bdp,
 273         struct kiocb            *iocb,
 274         const struct iovec      *iovp,
 275         unsigned int            segs,
 276         loff_t                  *offset,
 277         int                     ioflags,
 278         cred_t                  *credp)
 279 {
 280         struct file             *file = iocb->ki_filp;
 281         size_t                  size = 0;
 282         ssize_t                 ret;
 283         xfs_fsize_t             n;
 284         xfs_inode_t             *ip;
 285         xfs_mount_t             *mp;
 286         vnode_t                 *vp;
 287         unsigned long           seg;
 288
 289         ip = XFS_BHVTOI(bdp);
 290         vp = BHV_TO_VNODE(bdp);
 291         mp = ip->i_mount;
 292
 293         XFS_STATS_INC(xs_read_calls);
 294
 295         /* START copy & waste from filemap.c */
 296         for (seg = 0; seg < segs; seg++) {
 297                 const struct iovec *iv = &iovp[seg];
 298
 299                 /*
 300                  * If any segment has a negative length, or the cumulative
 301                  * length ever wraps negative then return -EINVAL.
 302                  */
 303                 size += iv->iov_len;
 304                 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
 305                         return XFS_ERROR(-EINVAL);
 306         }
 307         /* END copy & waste from filemap.c */
 308
 309         if (ioflags & IO_ISDIRECT) {
 310                 xfs_buftarg_t   *target =
 311                         (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
 312                                 mp->m_rtdev_targp : mp->m_ddev_targp;
 313                 if ((*offset & target->pbr_smask) ||
 314                     (size & target->pbr_smask)) {
 315                         if (*offset == ip->i_d.di_size) {
 316                                 return (0);
 317                         }
 318                         return -XFS_ERROR(EINVAL);
 319                 }
 320         }
 321
 322         n = XFS_MAXIOFFSET(mp) - *offset;
 323         if ((n <= 0) || (size == 0))
 324                 return 0;
 325
 326         if (n < size)
 327                 size = n;
 328
 329         if (XFS_FORCED_SHUTDOWN(mp)) {
 330                 return -EIO;
 331         }
 332
 333         /* OK so we are holding the I/O lock for the duration
 334          * of the submission, then what happens if the I/O
 335          * does not really happen here, but is scheduled
 336          * later?
 337          */
 338         xfs_ilock(ip, XFS_IOLOCK_SHARED);
 339
 340         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
 341             !(ioflags & IO_INVIS)) {
 342                 vrwlock_t locktype = VRWLOCK_READ;
 343
 344                 ret = XFS_SEND_DATA(mp, DM_EVENT_READ,
 345                                         BHV_TO_VNODE(bdp), *offset, size,
 346                                         FILP_DELAY_FLAG(file), &locktype);
 347                 if (ret) {
 348                         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 349                         return -ret;
 350                 }
 351         }
 352
 353         xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
 354                                 iovp, segs, *offset, ioflags);
 355         ret = __generic_file_aio_read(iocb, iovp, segs, offset);
 356         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 357
 358         if (ret > 0)
 359                 XFS_STATS_ADD(xs_read_bytes, ret);
 360
 361         if (likely(!(ioflags & IO_INVIS)))
 362                 xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
 363
 364         return ret;
 365 }
 366
 367 ssize_t
 368 xfs_sendfile(
 369         bhv_desc_t              *bdp,
 370         struct file             *filp,
 371         loff_t                  *offset,
 372         int                     ioflags,
 373         size_t                  count,
 374         read_actor_t            actor,
 375         void                    *target,
 376         cred_t                  *credp)
 377 {
 378         ssize_t                 ret;
 379         xfs_fsize_t             n;
 380         xfs_inode_t             *ip;
 381         xfs_mount_t             *mp;
 382         vnode_t                 *vp;
 383
 384         ip = XFS_BHVTOI(bdp);
 385         vp = BHV_TO_VNODE(bdp);
 386         mp = ip->i_mount;
 387
 388         XFS_STATS_INC(xs_read_calls);
 389
 390         n = XFS_MAXIOFFSET(mp) - *offset;
 391         if ((n <= 0) || (count == 0))
 392                 return 0;
 393
 394         if (n < count)
 395                 count = n;
 396
 397         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 398                 return -EIO;
 399
 400         xfs_ilock(ip, XFS_IOLOCK_SHARED);
 401
 402         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) &&
 403             (!(ioflags & IO_INVIS))) {
 404                 vrwlock_t locktype = VRWLOCK_READ;
 405                 int error;
 406
 407                 error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count,
 408                                       FILP_DELAY_FLAG(filp), &locktype);
 409                 if (error) {
 410                         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 411                         return -error;
 412                 }
 413         }
 414         xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore,
 415                                 target, count, *offset, ioflags);
 416         ret = generic_file_sendfile(filp, offset, count, actor, target);
 417         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 418
 419         XFS_STATS_ADD(xs_read_bytes, ret);
 420         xfs_ichgtime(ip, XFS_ICHGTIME_ACC);
 421         return ret;
 422 }
 423
 424 /*
 425  * This routine is called to handle zeroing any space in the last
 426  * block of the file that is beyond the EOF.  We do this since the
 427  * size is being increased without writing anything to that block
 428  * and we don't want anyone to read the garbage on the disk.
 429  */
 430 STATIC int                              /* error (positive) */
 431 xfs_zero_last_block(
 432         struct inode    *ip,
 433         xfs_iocore_t    *io,
 434         xfs_off_t       offset,
 435         xfs_fsize_t     isize,
 436         xfs_fsize_t     end_size)
 437 {
 438         xfs_fileoff_t   last_fsb;
 439         xfs_mount_t     *mp;
 440         int             nimaps;
 441         int             zero_offset;
 442         int             zero_len;
 443         int             isize_fsb_offset;
 444         int             error = 0;
 445         xfs_bmbt_irec_t imap;
 446         loff_t          loff;
 447         size_t          lsize;
 448
 449         ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0);
 450         ASSERT(offset > isize);
 451
 452         mp = io->io_mount;
 453
 454         isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize);
 455         if (isize_fsb_offset == 0) {
 456                 /*
 457                  * There are no extra bytes in the last block on disk to
 458                  * zero, so return.
 459                  */
 460                 return 0;
 461         }
 462
 463         last_fsb = XFS_B_TO_FSBT(mp, isize);
 464         nimaps = 1;
 465         error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap,
 466                           &nimaps, NULL);
 467         if (error) {
 468                 return error;
 469         }
 470         ASSERT(nimaps > 0);
 471         /*
 472          * If the block underlying isize is just a hole, then there
 473          * is nothing to zero.
 474          */
 475         if (imap.br_startblock == HOLESTARTBLOCK) {
 476                 return 0;
 477         }
 478         /*
 479          * Zero the part of the last block beyond the EOF, and write it
 480          * out sync.  We need to drop the ilock while we do this so we
 481          * don't deadlock when the buffer cache calls back to us.
 482          */
 483         XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD);
 484         loff = XFS_FSB_TO_B(mp, last_fsb);
 485         lsize = XFS_FSB_TO_B(mp, 1);
 486
 487         zero_offset = isize_fsb_offset;
 488         zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset;
 489
 490         error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size);
 491
 492         XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 493         ASSERT(error >= 0);
 494         return error;
 495 }
 496
 497 /*
 498  * Zero any on disk space between the current EOF and the new,
 499  * larger EOF.  This handles the normal case of zeroing the remainder
 500  * of the last block in the file and the unusual case of zeroing blocks
 501  * out beyond the size of the file.  This second case only happens
 502  * with fixed size extents and when the system crashes before the inode
 503  * size was updated but after blocks were allocated.  If fill is set,
 504  * then any holes in the range are filled and zeroed.  If not, the holes
 505  * are left alone as holes.
 506  */
 507
 508 int                                     /* error (positive) */
 509 xfs_zero_eof(
 510         vnode_t         *vp,
 511         xfs_iocore_t    *io,
 512         xfs_off_t       offset,         /* starting I/O offset */
 513         xfs_fsize_t     isize,          /* current inode size */
 514         xfs_fsize_t     end_size)       /* terminal inode size */
 515 {
 516         struct inode    *ip = LINVFS_GET_IP(vp);
 517         xfs_fileoff_t   start_zero_fsb;
 518         xfs_fileoff_t   end_zero_fsb;
 519         xfs_fileoff_t   prev_zero_fsb;
 520         xfs_fileoff_t   zero_count_fsb;
 521         xfs_fileoff_t   last_fsb;
 522         xfs_extlen_t    buf_len_fsb;
 523         xfs_extlen_t    prev_zero_count;
 524         xfs_mount_t     *mp;
 525         int             nimaps;
 526         int             error = 0;
 527         xfs_bmbt_irec_t imap;
 528         loff_t          loff;
 529         size_t          lsize;
 530
 531         ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 532         ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
 533
 534         mp = io->io_mount;
 535
 536         /*
 537          * First handle zeroing the block on which isize resides.
 538          * We only zero a part of that block so it is handled specially.
 539          */
 540         error = xfs_zero_last_block(ip, io, offset, isize, end_size);
 541         if (error) {
 542                 ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 543                 ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
 544                 return error;
 545         }
 546
 547         /*
 548          * Calculate the range between the new size and the old
 549          * where blocks needing to be zeroed may exist.  To get the
 550          * block where the last byte in the file currently resides,
 551          * we need to subtract one from the size and truncate back
 552          * to a block boundary.  We subtract 1 in case the size is
 553          * exactly on a block boundary.
 554          */
 555         last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
 556         start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
 557         end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
 558         ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
 559         if (last_fsb == end_zero_fsb) {
 560                 /*
 561                  * The size was only incremented on its last block.
 562                  * We took care of that above, so just return.
 563                  */
 564                 return 0;
 565         }
 566
 567         ASSERT(start_zero_fsb <= end_zero_fsb);
 568         prev_zero_fsb = NULLFILEOFF;
 569         prev_zero_count = 0;
 570         while (start_zero_fsb <= end_zero_fsb) {
 571                 nimaps = 1;
 572                 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
 573                 error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb,
 574                                   0, NULL, 0, &imap, &nimaps, NULL);
 575                 if (error) {
 576                         ASSERT(ismrlocked(io->io_lock, MR_UPDATE));
 577                         ASSERT(ismrlocked(io->io_iolock, MR_UPDATE));
 578                         return error;
 579                 }
 580                 ASSERT(nimaps > 0);
 581
 582                 if (imap.br_state == XFS_EXT_UNWRITTEN ||
 583                     imap.br_startblock == HOLESTARTBLOCK) {
 584                         /*
 585                          * This loop handles initializing pages that were
 586                          * partially initialized by the code below this
 587                          * loop. It basically zeroes the part of the page
 588                          * that sits on a hole and sets the page as P_HOLE
 589                          * and calls remapf if it is a mapped file.
 590                          */
 591                         prev_zero_fsb = NULLFILEOFF;
 592                         prev_zero_count = 0;
 593                         start_zero_fsb = imap.br_startoff +
 594                                          imap.br_blockcount;
 595                         ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 596                         continue;
 597                 }
 598
 599                 /*
 600                  * There are blocks in the range requested.
 601                  * Zero them a single write at a time.  We actually
 602                  * don't zero the entire range returned if it is
 603                  * too big and simply loop around to get the rest.
 604                  * That is not the most efficient thing to do, but it
 605                  * is simple and this path should not be exercised often.
 606                  */
 607                 buf_len_fsb = XFS_FILBLKS_MIN(imap.br_blockcount,
 608                                               mp->m_writeio_blocks << 8);
 609                 /*
 610                  * Drop the inode lock while we're doing the I/O.
 611                  * We'll still have the iolock to protect us.
 612                  */
 613                 XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 614
 615                 loff = XFS_FSB_TO_B(mp, start_zero_fsb);
 616                 lsize = XFS_FSB_TO_B(mp, buf_len_fsb);
 617
 618                 error = xfs_iozero(ip, loff, lsize, end_size);
 619
 620                 if (error) {
 621                         goto out_lock;
 622                 }
 623
 624                 prev_zero_fsb = start_zero_fsb;
 625                 prev_zero_count = buf_len_fsb;
 626                 start_zero_fsb = imap.br_startoff + buf_len_fsb;
 627                 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
 628
 629                 XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 630         }
 631
 632         return 0;
 633
 634 out_lock:
 635
 636         XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD);
 637         ASSERT(error >= 0);
 638         return error;
 639 }
 640
 641 ssize_t                         /* bytes written, or (-) error */
 642 xfs_write(
 643         bhv_desc_t              *bdp,
 644         struct kiocb            *iocb,
 645         const struct iovec      *iovp,
 646         unsigned int            segs,
 647         loff_t                  *offset,
 648         int                     ioflags,
 649         cred_t                  *credp)
 650 {
 651         struct file             *file = iocb->ki_filp;
 652         size_t                  size = 0;
 653         xfs_inode_t             *xip;
 654         xfs_mount_t             *mp;
 655         ssize_t                 ret;
 656         int                     error = 0;
 657         xfs_fsize_t             isize, new_size;
 658         xfs_fsize_t             n, limit;
 659         xfs_iocore_t            *io;
 660         vnode_t                 *vp;
 661         unsigned long           seg;
 662         int                     iolock;
 663         int                     eventsent = 0;
 664         vrwlock_t               locktype;
 665
 666         XFS_STATS_INC(xs_write_calls);
 667
 668         vp = BHV_TO_VNODE(bdp);
 669         xip = XFS_BHVTOI(bdp);
 670
 671         /* START copy & waste from filemap.c */
 672         for (seg = 0; seg < segs; seg++) {
 673                 const struct iovec *iv = &iovp[seg];
 674
 675                 /*
 676                  * If any segment has a negative length, or the cumulative
 677                  * length ever wraps negative then return -EINVAL.
 678                  */
 679                 size += iv->iov_len;
 680                 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
 681                         return XFS_ERROR(-EINVAL);
 682         }
 683         /* END copy & waste from filemap.c */
 684
 685         if (size == 0)
 686                 return 0;
 687
 688         io = &xip->i_iocore;
 689         mp = io->io_mount;
 690
 691         if (XFS_FORCED_SHUTDOWN(mp)) {
 692                 return -EIO;
 693         }
 694
 695         if (ioflags & IO_ISDIRECT) {
 696                 xfs_buftarg_t   *target =
 697                         (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
 698                                 mp->m_rtdev_targp : mp->m_ddev_targp;
 699
 700                 if ((*offset & target->pbr_smask) ||
 701                     (size & target->pbr_smask)) {
 702                         return XFS_ERROR(-EINVAL);
 703                 }
 704 #if 0
 705                 iolock = XFS_IOLOCK_SHARED;
 706                 locktype = VRWLOCK_WRITE_DIRECT;
 707         } else {
 708                 iolock = XFS_IOLOCK_EXCL;
 709                 locktype = VRWLOCK_WRITE;
 710 #endif
 711         }
 712 #if 1
 713         iolock = XFS_IOLOCK_EXCL;
 714         locktype = VRWLOCK_WRITE;
 715 #endif
 716
 717         xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
 718
 719         isize = xip->i_d.di_size;
 720         limit = XFS_MAXIOFFSET(mp);
 721
 722         if (file->f_flags & O_APPEND)
 723                 *offset = isize;
 724
 725 start:
 726         n = limit - *offset;
 727         if (n <= 0) {
 728                 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 729                 return -EFBIG;
 730         }
 731
 732         if (n < size)
 733                 size = n;
 734
 735         new_size = *offset + size;
 736         if (new_size > isize) {
 737                 io->io_new_size = new_size;
 738         }
 739
 740         if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) &&
 741             !(ioflags & IO_INVIS) && !eventsent)) {
 742                 loff_t          savedsize = *offset;
 743                 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
 744
 745                 xfs_iunlock(xip, XFS_ILOCK_EXCL);
 746                 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp,
 747                                       *offset, size,
 748                                       dmflags, &locktype);
 749                 if (error) {
 750                         xfs_iunlock(xip, iolock);
 751                         return -error;
 752                 }
 753                 xfs_ilock(xip, XFS_ILOCK_EXCL);
 754                 eventsent = 1;
 755
 756                 /*
  757                  * The iolock was dropped and reacquired in XFS_SEND_DATA
 758                  * so we have to recheck the size when appending.
 759                  * We will only "goto start;" once, since having sent the
 760                  * event prevents another call to XFS_SEND_DATA, which is
 761                  * what allows the size to change in the first place.
 762                  */
 763                 if ((file->f_flags & O_APPEND) &&
 764                     savedsize != xip->i_d.di_size) {
 765                         *offset = isize = xip->i_d.di_size;
 766                         goto start;
 767                 }
 768         }
 769
 770         /*
 771          * On Linux, generic_file_write updates the times even if
 772          * no data is copied in so long as the write had a size.
 773          *
 774          * We must update xfs' times since revalidate will overcopy xfs.
 775          */
 776         if (size && !(ioflags & IO_INVIS))
 777                 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 778
 779         /*
 780          * If the offset is beyond the size of the file, we have a couple
 781          * of things to do. First, if there is already space allocated
 782          * we need to either create holes or zero the disk or ...
 783          *
 784          * If there is a page where the previous size lands, we need
 785          * to zero it out up to the new size.
 786          */
 787
 788         if ((0 && !(ioflags & IO_ISDIRECT)) && (*offset > isize && isize)) {
 789                 error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset,
 790                         isize, *offset + size);
 791                 if (error) {
 792                         xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
 793                         return(-error);
 794                 }
 795         }
 796         xfs_iunlock(xip, XFS_ILOCK_EXCL);
 797
 798         /*
 799          * If we're writing the file then make sure to clear the
 800          * setuid and setgid bits if the process is not being run
 801          * by root.  This keeps people from modifying setuid and
 802          * setgid binaries.
 803          */
 804
 805         if (((xip->i_d.di_mode & S_ISUID) ||
 806             ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
 807                 (S_ISGID | S_IXGRP))) &&
 808              !capable(CAP_FSETID)) {
 809                 error = xfs_write_clear_setuid(xip);
 810                 if (error) {
 811                         xfs_iunlock(xip, iolock);
 812                         return -error;
 813                 }
 814         }
 815
 816 retry:
 817         if (ioflags & IO_ISDIRECT) {
 818                 xfs_inval_cached_pages(vp, io, *offset, 1, 1);
 819                 xfs_rw_enter_trace(XFS_DIOWR_ENTER,
 820                                 io, iovp, segs, *offset, ioflags);
 821         } else {
 822                 xfs_rw_enter_trace(XFS_WRITE_ENTER,
 823                                 io, iovp, segs, *offset, ioflags);
 824         }
 825         ret = generic_file_aio_write_nolock(iocb, iovp, segs, offset);
 826
 827         if ((ret == -ENOSPC) &&
 828             DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) &&
 829             !(ioflags & IO_INVIS)) {
 830
 831                 xfs_rwunlock(bdp, locktype);
 832                 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp,
 833                                 DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL,
 834                                 0, 0, 0); /* Delay flag intentionally  unused */
 835                 if (error)
 836                         return -error;
 837                 xfs_rwlock(bdp, locktype);
 838                 *offset = xip->i_d.di_size;
 839                 goto retry;
 840         }
 841
 842         if (*offset > xip->i_d.di_size) {
 843                 xfs_ilock(xip, XFS_ILOCK_EXCL);
 844                 if (*offset > xip->i_d.di_size) {
 845                         struct inode    *inode = LINVFS_GET_IP(vp);
 846
 847                         xip->i_d.di_size = *offset;
 848                         i_size_write(inode, *offset);
 849                         xip->i_update_core = 1;
 850                         xip->i_update_size = 1;
 851                 }
 852                 xfs_iunlock(xip, XFS_ILOCK_EXCL);
 853         }
 854
 855         if (ret <= 0) {
 856                 xfs_rwunlock(bdp, locktype);
 857                 return ret;
 858         }
 859
 860         XFS_STATS_ADD(xs_write_bytes, ret);
 861
 862         /* Handle various SYNC-type writes */
 863         if ((file->f_flags & O_SYNC) || IS_SYNC(file->f_dentry->d_inode)) {
 864
 865                 /*
 866                  * If we're treating this as O_DSYNC and we have not updated the
 867                  * size, force the log.
 868                  */
 869
 870                 if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC)
 871                         && !(xip->i_update_size)) {
 872                         /*
 873                          * If an allocation transaction occurred
 874                          * without extending the size, then we have to force
 875                          * the log up to the proper point to ensure that the
 876                          * allocation is permanent.  We can't count on
 877                          * the fact that buffered writes lock out direct I/O
 878                          * writes - the direct I/O write could have extended
 879                          * the size nontransactionally, then finished before
 880                          * we started.  xfs_write_file will think that the file
 881                          * didn't grow but the update isn't safe unless the
 882                          * size change is logged.
 883                          *
 884                          * Force the log if we've committed a transaction
 885                          * against the inode or if someone else has and
 886                          * the commit record hasn't gone to disk (e.g.
 887                          * the inode is pinned).  This guarantees that
 888                          * all changes affecting the inode are permanent
 889                          * when we return.
 890                          */
 891
 892                         xfs_inode_log_item_t *iip;
 893                         xfs_lsn_t lsn;
 894
 895                         iip = xip->i_itemp;
 896                         if (iip && iip->ili_last_lsn) {
 897                                 lsn = iip->ili_last_lsn;
 898                                 xfs_log_force(mp, lsn,
 899                                                 XFS_LOG_FORCE | XFS_LOG_SYNC);
 900                         } else if (xfs_ipincount(xip) > 0) {
 901                                 xfs_log_force(mp, (xfs_lsn_t)0,
 902                                                 XFS_LOG_FORCE | XFS_LOG_SYNC);
 903                         }
 904
 905                 } else {
 906                         xfs_trans_t     *tp;
 907
 908                         /*
 909                          * O_SYNC or O_DSYNC _with_ a size update are handled
 910                          * the same way.
 911                          *
 912                          * If the write was synchronous then we need to make
 913                          * sure that the inode modification time is permanent.
 914                          * We'll have updated the timestamp above, so here
 915                          * we use a synchronous transaction to log the inode.
 916                          * It's not fast, but it's necessary.
 917                          *
 918                          * If this is a dsync write and the size got changed
 919                          * non-transactionally, then we need to ensure that
 920                          * the size change gets logged in a synchronous
 921                          * transaction.
 922                          */
 923
 924                         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
 925                         if ((error = xfs_trans_reserve(tp, 0,
 926                                                       XFS_SWRITE_LOG_RES(mp),
 927                                                       0, 0, 0))) {
 928                                 /* Transaction reserve failed */
 929                                 xfs_trans_cancel(tp, 0);
 930                         } else {
 931                                 /* Transaction reserve successful */
 932                                 xfs_ilock(xip, XFS_ILOCK_EXCL);
 933                                 xfs_trans_ijoin(tp, xip, XFS_ILOCK_EXCL);
 934                                 xfs_trans_ihold(tp, xip);
 935                                 xfs_trans_log_inode(tp, xip, XFS_ILOG_CORE);
 936                                 xfs_trans_set_sync(tp);
 937                                 error = xfs_trans_commit(tp, 0, NULL);
 938                                 xfs_iunlock(xip, XFS_ILOCK_EXCL);
 939                         }
 940                 }
 941         } /* (ioflags & O_SYNC) */
 942
 943         xfs_rwunlock(bdp, locktype);
 944         return(ret);
 945 }
 946
 947 /*
 948  * Every xfs metadata buffer, log state machine buffers excepted,
 949  * has this routine installed as its b_bdstrat callback.  Routing
 950  * the I/O through here lets us catch a buffer that was prematurely
 951  * unpinned in order to forcibly shut down the filesystem.
 952  */
 953 int
 954 xfs_bdstrat_cb(struct xfs_buf *bp)
 955 {
 956         xfs_mount_t     *mount = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
 957
 958         /* No forced shutdown in progress: issue the I/O as usual. */
 959         if (!XFS_FORCED_SHUTDOWN(mount)) {
 960                 pagebuf_iorequest(bp);
 961                 return 0;
 962         }
 963
 964         xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
 965
 966         /*
 967          * A metadata write that was never logged but got written
 968          * delayed anyway has no transaction associated with it and
 969          * can be ignored; anything else goes to xfs_bioerror().
 970          */
 971         if (XFS_BUF_IODONE_FUNC(bp) != NULL || (XFS_BUF_ISREAD(bp)) != 0)
 972                 return xfs_bioerror(bp);
 973
 974         return xfs_bioerror_relse(bp);
 975 }
 976
 977 /* Sanity-check the inode, then hand the range to xfs_iomap(). */
 978 int
 979 xfs_bmap(
 980         bhv_desc_t      *bdp,
 981         xfs_off_t       offset,
 982         ssize_t         count,
 983         int             flags,
 984         xfs_iomap_t     *iomapp,
 985         int             *niomaps)
 986 {
 987         xfs_inode_t     *ip = XFS_BHVTOI(bdp);
 988
 989         /* Regular files only; RT flag must agree between inode and iocore. */
 990         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
 991         ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) ==
 992                ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0));
 993         return xfs_iomap(&ip->i_iocore, offset, count, flags, iomapp, niomaps);
 994 }
 995
 996 /*
 997  * Wraps bdstrat so that data stops going to disk while the
 998  * filesystem is being shut down.  User data typically takes this
 999  * path; the superblock is one of the exceptions.
1000  */
1001 int
1002 xfsbdstrat(
1003         struct xfs_mount        *mp,
1004         struct xfs_buf          *bp)
1005 {
1006         ASSERT(mp);
1007
1008         if (XFS_FORCED_SHUTDOWN(mp)) {
1009                 /* Forced shutdown: do not issue the I/O. */
1010                 xfs_buftrace("XFSBDSTRAT IOERROR", bp);
1011                 return xfs_bioerror_relse(bp);
1012         }
1013
1014         /*
1015          * Grio redirection would go here: if (XFS_BUF_IS_GRIO(bp)) { ...
1016          */
1017         pagebuf_iorequest(bp);
1018         return 0;
1019 }
1020
1021 /*
1022  * Returns EROFS (after logging that @message cannot proceed) when the
1023  * data, log or - if present - realtime device is read-only, else 0.
1024  */
1025 int
1026 xfs_dev_is_read_only(
1027         xfs_mount_t             *mp,
1028         char                    *message)
1029 {
1030         int     readonly = xfs_readonly_buftarg(mp->m_ddev_targp) ||
1031                            xfs_readonly_buftarg(mp->m_logdev_targp) ||
1032                            (mp->m_rtdev_targp &&
1033                             xfs_readonly_buftarg(mp->m_rtdev_targp));
1034
1035         if (!readonly)
1036                 return 0;
1037         cmn_err(CE_NOTE, "XFS: %s required on read-only device.", message);
1038         cmn_err(CE_NOTE, "XFS: write access unavailable, cannot proceed.");
1039         return EROFS;
1040 }