2 * Copyright (C) 2005-2011 Junjiro R. Okajima
4 * This program, aufs is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 * file and vm operations
23 #include <linux/file.h>
24 #include <linux/fs_stack.h>
25 #include <linux/mman.h>
27 #include <linux/security.h>
/*
 * Open step for a non-directory aufs file.
 * Starts with FiMustWriteLock(file) (presumably a check that the caller
 * already holds the file-info write lock), validates the dentry with
 * au_d_alive(), clears finfo->fi_htop and finfo->fi_hvmop, and opens h_file
 * through au_h_open() at the index returned by au_dbstart(dentry).
 * That index and h_file are then stored on @file via au_set_fbstart() and
 * au_set_h_fptr(), followed by au_update_figen().
 * err receives the au_d_alive() result and PTR_ERR(h_file).
 */
30 int au_do_open_nondir(struct file *file, int flags)
35 struct dentry *dentry;
36 struct au_finfo *finfo;
38 FiMustWriteLock(file);
40 dentry = file->f_dentry;
41 err = au_d_alive(dentry);
/* begin with an empty top-file slot and no saved vm operations */
46 memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop));
47 finfo->fi_hvmop = NULL;
48 bindex = au_dbstart(dentry);
49 h_file = au_h_open(dentry, bindex, flags, file);
51 err = PTR_ERR(h_file);
/* record the index the file was opened at together with its h_file */
53 au_set_fbstart(file, bindex);
54 au_set_h_fptr(file, bindex, h_file);
55 au_update_figen(file);
56 /* todo: necessary? */
57 /* file->f_ra = h_file->f_ra; */
/*
 * Open handler for non-directory files (installed as .open in
 * aufs_file_fop).  Emits a debug line with the dentry name, file flags and
 * f_mode, takes the superblock read lock with AuLock_FLUSH, and hands the
 * actual work to au_do_open() with au_do_open_nondir as the callback and no
 * fidir argument.  The inode parameter is unused.
 */
64 static int aufs_open_nondir(struct inode *inode __maybe_unused,
68 struct super_block *sb;
70 AuDbg("%.*s, f_ flags 0x%x, f_mode 0x%x\n",
71 AuDLNPair(file->f_dentry), vfsub_file_flags(file),
74 sb = file->f_dentry->d_sb;
75 si_read_lock(sb, AuLock_FLUSH);
76 err = au_do_open(file, au_do_open_nondir, /*fidir*/NULL);
/*
 * Release handler for non-directory files (installed as .release in
 * aufs_file_fop).  Takes the index from finfo->fi_btop, removes @file from
 * the superblock's file list and clears the h_file pointer stored at that
 * index.  The inode parameter is unused.
 */
81 int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file)
83 struct au_finfo *finfo;
87 bindex = finfo->fi_btop;
89 /* remove me from sb->s_files */
90 file_sb_list_del(file);
/* passing NULL drops the h_file reference kept for this index */
91 au_set_h_fptr(file, bindex, NULL);
98 /* ---------------------------------------------------------------------- */
/*
 * Flush callback for non-directory files: fetches the top h_file with
 * au_hf_top(file) and flushes it through vfsub_flush() using the same
 * owner @id; err holds the vfsub_flush() result.
 */
100 static int au_do_flush_nondir(struct file *file, fl_owner_t id)
106 h_file = au_hf_top(file);
108 err = vfsub_flush(h_file, id);
/*
 * Flush handler (installed as .flush in aufs_file_fop): delegates to
 * au_do_flush() with au_do_flush_nondir as the per-file callback and
 * returns its result.
 */
112 static int aufs_flush_nondir(struct file *file, fl_owner_t id)
114 return au_do_flush(file, id, au_do_flush_nondir);
117 /* ---------------------------------------------------------------------- */
/*
 * Read from an aufs file by forwarding to its top h_file.
 * Sequence: take the superblock read lock (AuLock_FLUSH | AuLock_NOPLMW),
 * revalidate and lock the file/dentry info without the write lock
 * (au_reval_and_lock_fdi(..., wlock 0), reopening via au_reopen_nondir),
 * read through vfsub_read_u() on au_hf_top(file), copy the atime of the
 * h_file's inode onto the aufs inode, then drop the di and fi read locks.
 * err carries the au_reval_and_lock_fdi() and vfsub_read_u() results.
 */
119 static ssize_t aufs_read(struct file *file, char __user *buf, size_t count,
123 struct dentry *dentry;
125 struct super_block *sb;
127 dentry = file->f_dentry;
129 si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
130 err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0);
134 h_file = au_hf_top(file);
135 err = vfsub_read_u(h_file, buf, count, ppos);
136 /* todo: necessary? */
137 /* file->f_ra = h_file->f_ra; */
/* reflect the access time of the file that was actually read */
138 fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode);
140 di_read_unlock(dentry, AuLock_IR);
141 fi_read_unlock(file);
149 * it safely locks both i_mutex and si_rwsem (the latter for read).
150 * if the plink maintenance mode continues forever (that is the problem),
/*
 * Acquire inode->i_mutex and then the superblock read lock with
 * AuLock_FLUSH | AuLock_NOPLM, keeping the result in err.
 * The other visible path releases i_mutex again and calls si_read_lock()
 * with AuLock_NOPLMW instead.
 * NOTE(review): presumably that path is taken when the first si_read_lock()
 * fails, and the sequence is retried afterwards -- confirm.
 */
153 static void au_mtx_and_read_lock(struct inode *inode)
156 struct super_block *sb = inode->i_sb;
159 mutex_lock(&inode->i_mutex);
160 err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
163 mutex_unlock(&inode->i_mutex);
164 si_read_lock(sb, AuLock_NOPLMW);
/*
 * Write to an aufs file by forwarding to its top h_file.
 * Sequence: au_mtx_and_read_lock(inode), revalidate and write-lock the
 * file/dentry info, prepare the file with au_ready_to_write(file, -1, &pin),
 * downgrade the dentry-info lock, write through vfsub_write_u() on
 * au_hf_top(file), then refresh the aufs inode (au_cpup_attr_timesizes()
 * and i_mode copied from the h_file's inode).
 * Locks are dropped in the order di read lock, fi write lock, si read lock,
 * i_mutex.
 */
169 static ssize_t aufs_write(struct file *file, const char __user *ubuf,
170 size_t count, loff_t *ppos)
174 struct dentry *dentry;
/* non-const alias of ubuf; this is what gets passed to vfsub_write_u() */
177 char __user *buf = (char __user *)ubuf;
179 dentry = file->f_dentry;
180 inode = dentry->d_inode;
181 au_mtx_and_read_lock(inode);
183 err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
187 err = au_ready_to_write(file, -1, &pin);
188 di_downgrade_lock(dentry, AuLock_IR);
192 h_file = au_hf_top(file);
194 err = vfsub_write_u(h_file, buf, count, ppos);
195 au_cpup_attr_timesizes(inode);
/* mirror the mode of the inode that was written onto the aufs inode */
196 inode->i_mode = h_file->f_dentry->d_inode->i_mode;
199 di_read_unlock(dentry, AuLock_IR);
200 fi_write_unlock(file);
202 si_read_unlock(inode->i_sb);
203 mutex_unlock(&inode->i_mutex);
/*
 * Common helper for the aio read/write handlers.
 * Checks security_file_permission(h_file, rw), selects
 * h_file->f_op->aio_read or ->aio_write (the latter when rw == MAY_WRITE),
 * redirects kio->ki_filp to h_file and invokes the selected handler with
 * @iov, @nv and @pos.  err carries the permission check and handler results.
 */
207 static ssize_t au_do_aio(struct file *h_file, int rw, struct kiocb *kio,
208 const struct iovec *iov, unsigned long nv, loff_t pos)
212 ssize_t (*func)(struct kiocb *, const struct iovec *, unsigned long,
215 err = security_file_permission(h_file, rw);
222 func = h_file->f_op->aio_read;
223 else if (rw == MAY_WRITE)
224 func = h_file->f_op->aio_write;
/* the handler of h_file must see h_file, not the aufs file, in the kiocb */
227 kio->ki_filp = h_file;
228 err = func(kio, iov, nv, pos);
231 /* currently there is no such fs */
/*
 * aio read handler (installed as .aio_read in aufs_file_fop).
 * Takes the superblock read lock (AuLock_FLUSH | AuLock_NOPLMW),
 * revalidates and read-locks the file/dentry info, runs
 * au_do_aio(h_file, MAY_READ, ...) on au_hf_top(file), copies the atime of
 * the h_file's inode onto the aufs inode, then drops the di and fi read
 * locks.
 */
238 static ssize_t aufs_aio_read(struct kiocb *kio, const struct iovec *iov,
239 unsigned long nv, loff_t pos)
242 struct file *file, *h_file;
243 struct dentry *dentry;
244 struct super_block *sb;
247 dentry = file->f_dentry;
249 si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
250 err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0);
254 h_file = au_hf_top(file);
255 err = au_do_aio(h_file, MAY_READ, kio, iov, nv, pos);
256 /* todo: necessary? */
257 /* file->f_ra = h_file->f_ra; */
258 fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode);
259 di_read_unlock(dentry, AuLock_IR);
260 fi_read_unlock(file);
/*
 * aio write handler (installed as .aio_write in aufs_file_fop).
 * Same locking shape as aufs_write(): au_mtx_and_read_lock(inode),
 * write-locked revalidation, au_ready_to_write(), lock downgrade, then
 * au_do_aio(h_file, MAY_WRITE, ...) on au_hf_top(file).  Afterwards the
 * aufs inode is refreshed (au_cpup_attr_timesizes() and i_mode copied from
 * the h_file's inode) and the locks are released in the order di, fi, si,
 * i_mutex.
 */
267 static ssize_t aufs_aio_write(struct kiocb *kio, const struct iovec *iov,
268 unsigned long nv, loff_t pos)
272 struct dentry *dentry;
274 struct file *file, *h_file;
277 dentry = file->f_dentry;
278 inode = dentry->d_inode;
279 au_mtx_and_read_lock(inode);
281 err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
285 err = au_ready_to_write(file, -1, &pin);
286 di_downgrade_lock(dentry, AuLock_IR);
291 h_file = au_hf_top(file);
292 err = au_do_aio(h_file, MAY_WRITE, kio, iov, nv, pos);
293 au_cpup_attr_timesizes(inode);
294 inode->i_mode = h_file->f_dentry->d_inode->i_mode;
297 di_read_unlock(dentry, AuLock_IR);
298 fi_write_unlock(file);
300 si_read_unlock(inode->i_sb);
301 mutex_unlock(&inode->i_mutex);
/*
 * splice-read handler (installed as .splice_read in aufs_file_fop).
 * Takes the superblock read lock, revalidates and read-locks the
 * file/dentry info, and splices from au_hf_top(file) into @pipe with
 * vfsub_splice_to().  When au_test_loopback_kthread() is true, the aufs
 * file's f_mapping is first pointed at the h_file's mapping.
 * Finally the atime is copied from the h_file's inode and the di and fi
 * read locks are dropped.
 */
305 static ssize_t aufs_splice_read(struct file *file, loff_t *ppos,
306 struct pipe_inode_info *pipe, size_t len,
311 struct dentry *dentry;
312 struct super_block *sb;
314 dentry = file->f_dentry;
316 si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
317 err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0);
322 h_file = au_hf_top(file);
323 if (au_test_loopback_kthread()) {
324 file->f_mapping = h_file->f_mapping;
325 smp_mb(); /* unnecessary? */
327 err = vfsub_splice_to(h_file, ppos, pipe, len, flags);
328 /* todo: necessary? */
329 /* file->f_ra = h_file->f_ra; */
330 fsstack_copy_attr_atime(dentry->d_inode, h_file->f_dentry->d_inode);
332 di_read_unlock(dentry, AuLock_IR);
333 fi_read_unlock(file);
/*
 * splice-write handler (installed as .splice_write in aufs_file_fop).
 * Same locking shape as aufs_write(): au_mtx_and_read_lock(inode),
 * write-locked revalidation, au_ready_to_write(), lock downgrade, then
 * vfsub_splice_from() from @pipe into au_hf_top(file).  Afterwards the aufs
 * inode is refreshed (au_cpup_attr_timesizes() and i_mode copied from the
 * h_file's inode) and the locks are released in the order di, fi, si,
 * i_mutex.
 */
341 aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos,
342 size_t len, unsigned int flags)
346 struct dentry *dentry;
350 dentry = file->f_dentry;
351 inode = dentry->d_inode;
352 au_mtx_and_read_lock(inode);
353 err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
357 err = au_ready_to_write(file, -1, &pin);
358 di_downgrade_lock(dentry, AuLock_IR);
362 h_file = au_hf_top(file);
364 err = vfsub_splice_from(pipe, h_file, ppos, len, flags);
365 au_cpup_attr_timesizes(inode);
366 inode->i_mode = h_file->f_dentry->d_inode->i_mode;
369 di_read_unlock(dentry, AuLock_IR);
370 fi_write_unlock(file);
372 si_read_unlock(inode->i_sb);
373 mutex_unlock(&inode->i_mutex);
377 /* ---------------------------------------------------------------------- */
/*
 * Return a file for @vma only when it passes two checks: au_fi(file) is
 * non-NULL and au_test_aufs() accepts the superblock of its dentry.
 * The vm handlers in this file use the result as a wait_event() condition,
 * so a NULL result keeps them waiting.
 * NOTE(review): the file is presumably vma->vm_file -- confirm.
 */
379 static struct file *au_safe_file(struct vm_area_struct *vma)
384 if (au_fi(file) && au_test_aufs(file->f_dentry->d_sb))
/*
 * Counterpart of the temporary "vma->vm_file = h_file" assignment made by
 * the vm handlers in this file; they call it with the aufs file right after
 * invoking the saved vm operation.
 * NOTE(review): presumably this puts @file back into vma->vm_file --
 * confirm.  The memory barrier below is intentionally left disabled.
 */
389 static void au_reset_file(struct vm_area_struct *vma, struct file *file)
392 /* smp_mb(); */ /* flush vm_file */
/*
 * Page-fault handler of aufs_vm_ops.
 * Waits on a function-local wait queue until au_safe_file(vma) yields a
 * file, then, under finfo->fi_vm_mtx, temporarily replaces vma->vm_file
 * with the top h_file (finfo->fi_htop.hf_file), calls the saved
 * finfo->fi_hvmop->fault() and restores the file with au_reset_file().
 * No revalidation is done and no si lock is taken here.
 * err holds the result of the saved ->fault().
 */
395 static int aufs_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
398 static DECLARE_WAIT_QUEUE_HEAD(wq);
399 struct file *file, *h_file;
400 struct au_finfo *finfo;
402 /* todo: non-robr mode, use vm_file as it is? */
403 wait_event(wq, (file = au_safe_file(vma)));
405 /* do not revalidate, no si lock */
/* debug checks: not a directory file, and both h_file and saved vm ops exist */
407 AuDebugOn(finfo->fi_hdir);
408 h_file = finfo->fi_htop.hf_file;
409 AuDebugOn(!h_file || !finfo->fi_hvmop);
/* fi_vm_mtx covers the window in which vma->vm_file points at h_file */
411 mutex_lock(&finfo->fi_vm_mtx);
412 vma->vm_file = h_file;
413 err = finfo->fi_hvmop->fault(vma, vmf);
414 /* todo: necessary? */
415 /* file->f_ra = h_file->f_ra; */
416 au_reset_file(vma, file);
417 mutex_unlock(&finfo->fi_vm_mtx);
418 #if 0 /* def CONFIG_SMP */
419 /* wake_up_nr(&wq, online_cpu - 1); */
/*
 * page_mkwrite handler of aufs_vm_ops.
 * Follows the same pattern as aufs_fault(): wait until au_safe_file(vma)
 * yields a file, then, under finfo->fi_vm_mtx, temporarily set
 * vma->vm_file to the top h_file, call the saved
 * finfo->fi_hvmop->page_mkwrite() and restore the file with
 * au_reset_file().  err holds the result of the saved handler.
 */
428 static int aufs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
431 static DECLARE_WAIT_QUEUE_HEAD(wq);
432 struct file *file, *h_file;
433 struct au_finfo *finfo;
435 wait_event(wq, (file = au_safe_file(vma)));
438 AuDebugOn(finfo->fi_hdir);
439 h_file = finfo->fi_htop.hf_file;
440 AuDebugOn(!h_file || !finfo->fi_hvmop);
442 mutex_lock(&finfo->fi_vm_mtx);
443 vma->vm_file = h_file;
444 err = finfo->fi_hvmop->page_mkwrite(vma, vmf);
445 au_reset_file(vma, file);
446 mutex_unlock(&finfo->fi_vm_mtx);
/*
 * close handler of aufs_vm_ops.
 * Follows the same pattern as aufs_fault(): wait until au_safe_file(vma)
 * yields a file, then, under finfo->fi_vm_mtx, temporarily set
 * vma->vm_file to the top h_file, call the saved finfo->fi_hvmop->close()
 * and restore the file with au_reset_file().
 */
452 static void aufs_vm_close(struct vm_area_struct *vma)
454 static DECLARE_WAIT_QUEUE_HEAD(wq);
455 struct file *file, *h_file;
456 struct au_finfo *finfo;
458 wait_event(wq, (file = au_safe_file(vma)));
461 AuDebugOn(finfo->fi_hdir);
462 h_file = finfo->fi_htop.hf_file;
463 AuDebugOn(!h_file || !finfo->fi_hvmop);
465 mutex_lock(&finfo->fi_vm_mtx);
466 vma->vm_file = h_file;
467 finfo->fi_hvmop->close(vma);
468 au_reset_file(vma, file);
469 mutex_unlock(&finfo->fi_vm_mtx);
/*
 * vm operations table of aufs.  The entries shown are the wrappers
 * aufs_vm_close() and aufs_page_mkwrite(), each of which forwards to the
 * vm operations saved in finfo->fi_hvmop.
 */
473 const struct vm_operations_struct aufs_vm_ops = {
474 .close = aufs_vm_close,
476 .page_mkwrite = aufs_page_mkwrite
479 /* ---------------------------------------------------------------------- */
481 /* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */
/*
 * AuConv_VM_PROT(f, b): hand f to _calc_vm_trans() together with the bit
 * pair VM_<b> / PROT_<b>, i.e. the reverse direction of
 * calc_vm_prot_bits(): from a vm_flags bit to the matching PROT_ bit.
 */
482 #define AuConv_VM_PROT(f, b) _calc_vm_trans(f, VM_##b, PROT_##b)
/*
 * Architecture-specific part of the vm_flags -> PROT_ conversion used by
 * au_prot_conv().
 * One visible path debug-checks that arch_calc_vm_prot_bits(-1) equals
 * VM_SAO and returns the SAO bit translated with AuConv_VM_PROT(); the
 * other path only debug-checks that arch_calc_vm_prot_bits(-1) is zero.
 * NOTE(review): the condition selecting between the two paths is presumably
 * a preprocessor test for ppc64 -- confirm.
 */
484 static unsigned long au_arch_prot_conv(unsigned long flags)
486 /* currently ppc64 only */
488 /* cf. linux/arch/powerpc/include/asm/mman.h */
489 AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO);
490 return AuConv_VM_PROT(flags, SAO);
492 AuDebugOn(arch_calc_vm_prot_bits(-1));
/*
 * Convert vm_flags into PROT_* bits: the READ, WRITE and EXEC bits are
 * translated with AuConv_VM_PROT() and OR-ed with the arch-specific bits
 * from au_arch_prot_conv().
 */
497 static unsigned long au_prot_conv(unsigned long flags)
499 return AuConv_VM_PROT(flags, READ)
500 | AuConv_VM_PROT(flags, WRITE)
501 | AuConv_VM_PROT(flags, EXEC)
502 | au_arch_prot_conv(flags);
505 /* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */
/*
 * AuConv_VM_MAP(f, b): hand f to _calc_vm_trans() together with the bit
 * pair VM_<b> / MAP_<b>, i.e. the reverse direction of
 * calc_vm_flag_bits(): from a vm_flags bit to the matching MAP_ bit.
 */
506 #define AuConv_VM_MAP(f, b) _calc_vm_trans(f, VM_##b, MAP_##b)
/*
 * Convert vm_flags into MAP_* bits: the GROWSDOWN, DENYWRITE, EXECUTABLE
 * and LOCKED bits are translated with AuConv_VM_MAP() and OR-ed together.
 */
508 static unsigned long au_flag_conv(unsigned long flags)
510 return AuConv_VM_MAP(flags, GROWSDOWN)
511 | AuConv_VM_MAP(flags, DENYWRITE)
512 | AuConv_VM_MAP(flags, EXECUTABLE)
513 | AuConv_VM_MAP(flags, LOCKED);
/*
 * Find out which vm operations and vma flags h_file's own mmap would set.
 * Runs security_file_mmap() with prot/flags derived from vma->vm_flags,
 * calls h_file->f_op->mmap() on @vma, captures vma->vm_ops as the return
 * value and vma->vm_flags into *flags, and then undoes the mapping with
 * do_munmap() on current->mm.
 * The return value is set to an ERR_PTR in these cases: -ENODEV around the
 * test for a missing f_op or f_op->mmap, the error from
 * security_file_mmap() or from ->mmap(), and -EIO (after an AuIOErr
 * message) for the internal unmapping.
 */
516 static struct vm_operations_struct *
517 au_hvmop(struct file *h_file, struct vm_area_struct *vma, unsigned long *flags)
519 struct vm_operations_struct *h_vmop;
523 h_vmop = ERR_PTR(-ENODEV);
524 if (!h_file->f_op || !h_file->f_op->mmap)
527 prot = au_prot_conv(vma->vm_flags);
528 err = security_file_mmap(h_file, /*reqprot*/prot, prot,
529 au_flag_conv(vma->vm_flags), vma->vm_start, 0);
530 h_vmop = ERR_PTR(err);
534 err = h_file->f_op->mmap(h_file, vma);
535 h_vmop = ERR_PTR(err);
539 /* oops, it became 'const' */
540 h_vmop = (struct vm_operations_struct *)vma->vm_ops;
541 *flags = vma->vm_flags;
542 err = do_munmap(current->mm, vma->vm_start,
543 vma->vm_end - vma->vm_start);
545 AuIOErr("failed internal unmapping %.*s, %d\n",
546 AuDLNPair(h_file->f_dentry), err);
547 h_vmop = ERR_PTR(-EIO);
555 * This is another ugly approach to keep the lock order, particularly
556 * mm->mmap_sem and aufs rwsem. The previous approach was reverted and you can
557 * find it in git-log, if you want.
559 * native readdir: i_mutex, copy_to_user, mmap_sem
560 * aufs readdir: i_mutex, rwsem, nested-i_mutex, copy_to_user, mmap_sem
562 * Before aufs_mmap() mmap_sem is acquired already, but aufs_mmap() has to
563 * acquire aufs rwsem. It introduces a circular locking dependency.
564 * To address this problem, aufs_mmap() delegates the part which requires aufs
565 * rwsem to its internal workqueue.
568 /* very ugly approach */
/*
 * Argument bundle that aufs_mmap() passes to au_call_mmap_pre() through
 * au_wkq_wait_pre(); au_call_mmap_pre() unpacks it into an au_mmap_pre()
 * call.
 */
571 struct au_mmap_pre_args {
574 struct vm_area_struct *vma;
579 struct au_branch *br;
/*
 * The part of mmap that needs the aufs locks.
 * Takes the superblock read lock (AuLock_NOPLMW), revalidates and
 * write-locks the file/dentry info, reports through *mmapped whether the
 * file is already mmapped (au_test_mmapped()), and prepares the file with
 * au_ready_to_write(file, -1, &pin).  wlock is true only for a file opened
 * with FMODE_WRITE being mapped VM_SHARED.
 * NOTE(review): au_ready_to_write() is presumably called only when wlock is
 * set -- confirm.
 * After releasing the dentry-info write lock it returns the branch at
 * au_fbstart(file) in *br and the top h_file in *h_file, takes the mmap
 * lock with au_fi_mmap_lock(file) and drops the file-info write lock.
 */
583 static int au_mmap_pre(struct file *file, struct vm_area_struct *vma,
584 struct file **h_file, struct au_branch **br,
588 aufs_bindex_t bstart;
589 const unsigned char wlock
590 = !!(file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED);
591 struct dentry *dentry;
592 struct super_block *sb;
594 dentry = file->f_dentry;
596 si_read_lock(sb, AuLock_NOPLMW);
597 err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
601 *mmapped = !!au_test_mmapped(file);
605 err = au_ready_to_write(file, -1, &pin);
606 di_write_unlock(dentry);
611 di_write_unlock(dentry);
612 bstart = au_fbstart(file);
613 *br = au_sbr(sb, bstart);
614 *h_file = au_hf_top(file);
616 au_fi_mmap_lock(file);
619 fi_write_unlock(file);
/*
 * Workqueue entry point: unpacks the struct au_mmap_pre_args behind @args,
 * calls au_mmap_pre() and stores its return value through a->errp.
 */
625 static void au_call_mmap_pre(void *args)
627 struct au_mmap_pre_args *a = args;
628 *a->errp = au_mmap_pre(a->file, a->vma, &a->h_file, &a->br,
/*
 * mmap handler for aufs files.
 * The lock-taking part is run through au_wkq_wait_pre() with
 * au_call_mmap_pre(), which fills in args.  With the h_file and branch from
 * args, this function then:
 *  - assigns the h_file's f_mapping to file->f_mapping when the file is
 *    not yet mmapped and au_test_fs_bad_mapping() flags the h_dentry's
 *    superblock,
 *  - runs au_hvmop() to obtain the h_file's vm operations and vma flags,
 *  - obtains vmop from au_dy_vmop() and maps the aufs file with
 *    generic_file_mmap(),
 *  - sets vma->vm_flags to the flags obtained from au_hvmop() and saves
 *    h_vmop in finfo->fi_hvmop,
 *  - marks the h_file accessed and copies its atime to the aufs inode,
 *  - releases the mmap lock with au_fi_mmap_unlock().
 */
632 static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
635 unsigned long h_vmflags;
636 struct au_finfo *finfo;
637 struct dentry *h_dentry;
638 struct vm_operations_struct *h_vmop, *vmop;
639 struct au_mmap_pre_args args = {
645 wkq_err = au_wkq_wait_pre(au_call_mmap_pre, &args);
646 if (unlikely(wkq_err))
651 mutex_set_owner(&finfo->fi_mmap);
653 h_dentry = args.h_file->f_dentry;
654 if (!args.mmapped && au_test_fs_bad_mapping(h_dentry->d_sb)) {
656 * by this assignment, f_mapping will differs from aufs inode
658 * if someone else mixes the use of f_dentry->d_inode and
659 * f_mapping->host, then a problem may arise.
661 file->f_mapping = args.h_file->f_mapping;
664 /* always try this internal mmap to get vma flags */
665 h_vmflags = 0; /* gcc warning */
666 h_vmop = au_hvmop(args.h_file, vma, &h_vmflags);
667 err = PTR_ERR(h_vmop);
670 AuDebugOn(args.mmapped && h_vmop != finfo->fi_hvmop);
672 vmop = (void *)au_dy_vmop(file, args.br, h_vmop);
678 * unnecessary to handle MAP_DENYWRITE and deny_write_access()?
679 * currently MAP_DENYWRITE from userspace is ignored, but elf loader
680 * sets it. when FMODE_EXEC is set (by open_exec() or sys_uselib()),
681 * both of the aufs file and the lower file is deny_write_access()-ed.
682 * finally I hope we can skip handlling MAP_DENYWRITE here.
684 err = generic_file_mmap(file, vma);
689 vma->vm_flags = h_vmflags;
691 finfo->fi_hvmop = h_vmop;
693 vfsub_file_accessed(args.h_file);
694 /* update without lock, I don't think it a problem */
695 fsstack_copy_attr_atime(file->f_dentry->d_inode, h_dentry->d_inode);
698 au_fi_mmap_unlock(file);
704 /* ---------------------------------------------------------------------- */
/*
 * fsync handler for non-directory files (installed as .fsync in
 * aufs_file_fop).
 * file->f_mapping->host is expected to be locked on entry (IMustLock).
 * It can differ from the aufs inode, because aufs_mmap() may assign the
 * h_file's f_mapping to file->f_mapping; in that case the host's i_mutex is
 * swapped for the aufs inode's i_mutex here and swapped back before
 * returning.
 * After taking the superblock read lock, the file/dentry info is
 * revalidated and write-locked, the file is prepared with
 * au_ready_to_write(), and the h_file's ->fsync() is called, if it has one,
 * under the h_file inode's i_mutex (lock subclass AuLsc_I_CHILD).
 * The h_file's attributes are then refreshed with vfsub_update_h_iattr()
 * and copied up with au_cpup_attr_timesizes().
 * err is preset to 0 around the FMODE_WRITE test, so a file without
 * FMODE_WRITE does not yield -EBADF.
 */
706 static int aufs_fsync_nondir(struct file *file, int datasync)
710 struct dentry *dentry;
713 struct super_block *sb;
715 dentry = file->f_dentry;
716 inode = dentry->d_inode;
717 IMustLock(file->f_mapping->host);
718 if (inode != file->f_mapping->host) {
719 mutex_unlock(&file->f_mapping->host->i_mutex);
720 mutex_lock(&inode->i_mutex);
725 err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
729 err = 0; /* -EBADF; */ /* posix? */
730 if (unlikely(!(file->f_mode & FMODE_WRITE)))
732 err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
736 err = au_ready_to_write(file, -1, &pin);
737 di_downgrade_lock(dentry, AuLock_IR);
743 h_file = au_hf_top(file);
744 if (h_file->f_op && h_file->f_op->fsync) {
748 * no filemap_fdatawrite() since aufs file has no its own
751 h_mtx = &h_file->f_dentry->d_inode->i_mutex;
752 mutex_lock_nested(h_mtx, AuLsc_I_CHILD);
753 err = h_file->f_op->fsync(h_file, datasync);
755 vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL);
757 au_cpup_attr_timesizes(inode);
762 di_read_unlock(dentry, AuLock_IR);
763 fi_write_unlock(file);
767 if (inode != file->f_mapping->host) {
768 mutex_unlock(&inode->i_mutex);
769 mutex_lock(&file->f_mapping->host->i_mutex);
774 /* no one supports this operation, currently */
/*
 * aio fsync handler for non-directory files.  It is not installed: the
 * .aio_fsync entry of aufs_file_fop is commented out.
 * Takes i_mutex and the superblock read lock via au_mtx_and_read_lock(),
 * revalidates and write-locks the file/dentry info, prepares the file with
 * au_ready_to_write(), and calls the h_file's ->aio_fsync(), if it has one,
 * with kio->ki_filp redirected to h_file.  The h_file's attributes are then
 * refreshed with vfsub_update_h_iattr() and copied up with
 * au_cpup_attr_timesizes().
 * err is preset to 0 around the FMODE_WRITE test, so a file without
 * FMODE_WRITE does not yield -EBADF.
 * Locks are released in the order di read lock, fi write lock, si read
 * lock, i_mutex.
 */
776 static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync)
780 struct dentry *dentry;
782 struct file *file, *h_file;
785 dentry = file->f_dentry;
786 inode = dentry->d_inode;
787 au_mtx_and_read_lock(inode);
789 err = 0; /* -EBADF; */ /* posix? */
790 if (unlikely(!(file->f_mode & FMODE_WRITE)))
792 err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
796 err = au_ready_to_write(file, -1, &pin);
797 di_downgrade_lock(dentry, AuLock_IR);
803 h_file = au_hf_top(file);
804 if (h_file->f_op && h_file->f_op->aio_fsync) {
808 h_d = h_file->f_dentry;
809 h_mtx = &h_d->d_inode->i_mutex;
810 if (!is_sync_kiocb(kio)) {
814 kio->ki_filp = h_file;
815 err = h_file->f_op->aio_fsync(kio, datasync);
816 mutex_lock_nested(h_mtx, AuLsc_I_CHILD);
818 vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL);
820 au_cpup_attr_timesizes(inode);
825 di_read_unlock(dentry, AuLock_IR);
826 fi_write_unlock(file);
/* struct inode names its superblock pointer i_sb (there is no ->sb member) */
828 si_read_unlock(inode->i_sb);
829 mutex_unlock(&inode->i_mutex);
/*
 * fasync handler (installed as .fasync in aufs_file_fop).
 * Takes the superblock read lock, revalidates and read-locks the
 * file/dentry info, and forwards to the top h_file's ->fasync() with the
 * same @fd and @flag when the h_file provides one.  The di and fi read
 * locks are dropped afterwards.
 */
834 static int aufs_fasync(int fd, struct file *file, int flag)
838 struct dentry *dentry;
839 struct super_block *sb;
841 dentry = file->f_dentry;
843 si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
844 err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0);
848 h_file = au_hf_top(file);
849 if (h_file->f_op && h_file->f_op->fasync)
850 err = h_file->f_op->fasync(fd, h_file, flag);
852 di_read_unlock(dentry, AuLock_IR);
853 fi_read_unlock(file);
860 /* ---------------------------------------------------------------------- */
862 /* no one supports this operation, currently */
/*
 * sendpage handler.  It is not installed: the .sendpage entry of
 * aufs_file_fop is commented out.
 */
864 static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset,
865 size_t len, loff_t *pos , int more)
870 /* ---------------------------------------------------------------------- */
/*
 * File operations table for non-directory aufs files.
 * .llseek deliberately uses default_llseek rather than generic_file_llseek.
 * The .aio_fsync and .sendpage entries are commented out, so
 * aufs_aio_fsync_nondir() and aufs_sendpage() are not reachable through
 * this table.
 */
872 const struct file_operations aufs_file_fop = {
873 .owner = THIS_MODULE,
875 * while generic_file_llseek/_unlocked() don't use BKL,
876 * don't use it since it operates file->f_mapping->host.
877 * in aufs, it may be a real file and may confuse users by UDBA.
879 /* .llseek = generic_file_llseek, */
880 .llseek = default_llseek,
884 .aio_read = aufs_aio_read,
885 .aio_write = aufs_aio_write,
886 #ifdef CONFIG_AUFS_POLL
889 .unlocked_ioctl = aufs_ioctl_nondir,
891 .compat_ioctl = aufs_ioctl_nondir, /* same */
894 .open = aufs_open_nondir,
895 .flush = aufs_flush_nondir,
896 .release = aufs_release_nondir,
897 .fsync = aufs_fsync_nondir,
898 /* .aio_fsync = aufs_aio_fsync_nondir, */
899 .fasync = aufs_fasync,
900 /* .sendpage = aufs_sendpage, */
901 .splice_write = aufs_splice_write,
902 .splice_read = aufs_splice_read,
904 .aio_splice_write = aufs_aio_splice_write,
905 .aio_splice_read = aufs_aio_splice_read