#include <linux/errno.h>
#include <linux/time.h>
#include <linux/aio_abi.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
+#ifdef CONFIG_EPOLL
+#include <linux/poll.h>
+#include <linux/anon_inodes.h>
+#endif
+
#if DEBUG > 1
#define dprintk printk
#else
put_page(info->ring_pages[i]);
if (info->mmap_size) {
- down_write(&ctx->mm->mmap_sem);
- do_munmap(ctx->mm, info->mmap_base, info->mmap_size);
- up_write(&ctx->mm->mmap_sem);
+ BUG_ON(ctx->mm != current->mm);
+ vm_munmap(info->mmap_base, info->mmap_size);
}
if (info->ring_pages && info->ring_pages != info->internal_pages)
info->nr = nr_events; /* trusted copy */
- ring = kmap_atomic(info->ring_pages[0], KM_USER0);
+ ring = kmap_atomic(info->ring_pages[0]);
ring->nr = nr_events; /* user copy */
ring->id = ctx->user_id;
ring->head = ring->tail = 0;
ring->compat_features = AIO_RING_COMPAT_FEATURES;
ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
ring->header_length = sizeof(struct aio_ring);
- kunmap_atomic(ring, KM_USER0);
+ kunmap_atomic(ring);
return 0;
}
/* aio_ring_event: returns a pointer to the event at the given index from
- * kmap_atomic(, km). Release the pointer with put_aio_ring_event();
+ * kmap_atomic(). Release the pointer with put_aio_ring_event();
*/
#define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-#define aio_ring_event(info, nr, km) ({ \
+#define aio_ring_event(info, nr) ({ \
unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
struct io_event *__event; \
__event = kmap_atomic( \
- (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
+ (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE]); \
__event += pos % AIO_EVENTS_PER_PAGE; \
__event; \
})
-#define put_aio_ring_event(event, km) do { \
+#define put_aio_ring_event(event) do { \
struct io_event *__event = (event); \
(void)__event; \
- kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
+ kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK)); \
} while(0)
static void ctx_rcu_free(struct rcu_head *head)
{
struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
- unsigned nr_events = ctx->max_reqs;
-
kmem_cache_free(kioctx_cachep, ctx);
-
- if (nr_events) {
- spin_lock(&aio_nr_lock);
- BUG_ON(aio_nr - nr_events > aio_nr);
- aio_nr -= nr_events;
- spin_unlock(&aio_nr_lock);
- }
}
/* __put_ioctx
*/
static void __put_ioctx(struct kioctx *ctx)
{
+ unsigned nr_events = ctx->max_reqs;
BUG_ON(ctx->reqs_active);
- cancel_delayed_work(&ctx->wq);
- cancel_work_sync(&ctx->wq.work);
+ cancel_delayed_work_sync(&ctx->wq);
aio_free_ring(ctx);
mmdrop(ctx->mm);
ctx->mm = NULL;
+ if (nr_events) {
+ spin_lock(&aio_nr_lock);
+ BUG_ON(aio_nr - nr_events > aio_nr);
+ aio_nr -= nr_events;
+ spin_unlock(&aio_nr_lock);
+ }
pr_debug("__put_ioctx: freeing %p\n", ctx);
call_rcu(&ctx->rcu_head, ctx_rcu_free);
}
-static inline void get_ioctx(struct kioctx *kioctx)
-{
- BUG_ON(atomic_read(&kioctx->users) <= 0);
- atomic_inc(&kioctx->users);
-}
-
static inline int try_get_ioctx(struct kioctx *kioctx)
{
return atomic_inc_not_zero(&kioctx->users);
{
struct mm_struct *mm;
struct kioctx *ctx;
- int did_sync = 0;
+ int err = -ENOMEM;
/* Prevent overflows */
if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
return ERR_PTR(-EINVAL);
}
- if ((unsigned long)nr_events > aio_max_nr)
+ if (!nr_events || (unsigned long)nr_events > aio_max_nr)
return ERR_PTR(-EAGAIN);
ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
goto out_freectx;
/* limit the number of system wide aios */
- do {
- spin_lock_bh(&aio_nr_lock);
- if (aio_nr + nr_events > aio_max_nr ||
- aio_nr + nr_events < aio_nr)
- ctx->max_reqs = 0;
- else
- aio_nr += ctx->max_reqs;
- spin_unlock_bh(&aio_nr_lock);
- if (ctx->max_reqs || did_sync)
- break;
-
- /* wait for rcu callbacks to have completed before giving up */
- synchronize_rcu();
- did_sync = 1;
- ctx->max_reqs = nr_events;
- } while (1);
-
- if (ctx->max_reqs == 0)
+ spin_lock(&aio_nr_lock);
+ if (aio_nr + nr_events > aio_max_nr ||
+ aio_nr + nr_events < aio_nr) {
+ spin_unlock(&aio_nr_lock);
goto out_cleanup;
+ }
+ aio_nr += ctx->max_reqs;
+ spin_unlock(&aio_nr_lock);
/* now link into global list. */
spin_lock(&mm->ioctx_lock);
return ctx;
out_cleanup:
- __put_ioctx(ctx);
- return ERR_PTR(-EAGAIN);
-
+ err = -EAGAIN;
+ aio_free_ring(ctx);
out_freectx:
mmdrop(mm);
kmem_cache_free(kioctx_cachep, ctx);
- ctx = ERR_PTR(-ENOMEM);
-
- dprintk("aio: error allocating ioctx %p\n", ctx);
- return ctx;
+ dprintk("aio: error allocating ioctx %d\n", err);
+ return ERR_PTR(err);
}
-/* aio_cancel_all
+/* kill_ctx
* Cancels all outstanding aio requests on an aio context. Used
* when the processes owning a context have all exited to encourage
* the rapid destruction of the kioctx.
*/
-static void aio_cancel_all(struct kioctx *ctx)
+static void kill_ctx(struct kioctx *ctx)
{
int (*cancel)(struct kiocb *, struct io_event *);
+ struct task_struct *tsk = current;
+ DECLARE_WAITQUEUE(wait, tsk);
struct io_event res;
+
spin_lock_irq(&ctx->ctx_lock);
ctx->dead = 1;
while (!list_empty(&ctx->active_reqs)) {
spin_lock_irq(&ctx->ctx_lock);
}
}
- spin_unlock_irq(&ctx->ctx_lock);
-}
-static void wait_for_all_aios(struct kioctx *ctx)
-{
- struct task_struct *tsk = current;
- DECLARE_WAITQUEUE(wait, tsk);
-
- spin_lock_irq(&ctx->ctx_lock);
if (!ctx->reqs_active)
goto out;
ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
hlist_del_rcu(&ctx->list);
- aio_cancel_all(ctx);
-
- wait_for_all_aios(ctx);
- /*
- * Ensure we don't leave the ctx on the aio_wq
- */
- cancel_work_sync(&ctx->wq.work);
+ kill_ctx(ctx);
if (1 != atomic_read(&ctx->users))
printk(KERN_DEBUG
"exit_aio:ioctx still alive: %d %d %d\n",
atomic_read(&ctx->users), ctx->dead,
ctx->reqs_active);
+ /*
+ * We don't need to bother with munmap() here -
+ * exit_mmap(mm) is coming and it'll unmap everything.
+ * Since aio_free_ring() uses non-zero ->mmap_size
+ * as indicator that it needs to unmap the area,
+ * just set it to 0; aio_free_ring() is the only
+ * place that uses ->mmap_size, so it's safe.
+ * That way we get all munmap done to current->mm -
+ * all other callers have ctx->mm == current->mm.
+ */
+ ctx->ring_info.mmap_size = 0;
put_ioctx(ctx);
}
}
fput(req->ki_filp);
/* Link the iocb into the context's free list */
+ rcu_read_lock();
spin_lock_irq(&ctx->ctx_lock);
really_put_req(ctx, req);
+ /*
+ * at that point ctx might've been killed, but actual
+ * freeing is RCU'd
+ */
spin_unlock_irq(&ctx->ctx_lock);
+ rcu_read_unlock();
- put_ioctx(ctx);
spin_lock_irq(&fput_lock);
}
spin_unlock_irq(&fput_lock);
* this function will be executed w/out any aio kthread wakeup.
*/
if (unlikely(!fput_atomic(req->ki_filp))) {
- get_ioctx(ctx);
spin_lock(&fput_lock);
list_add(&req->ki_list, &fput_head);
spin_unlock(&fput_lock);
unuse_mm(mm);
set_fs(oldfs);
/*
- * we're in a worker thread already, don't use queue_delayed_work,
+ * we're in a worker thread already; no point using non-zero delay
*/
if (requeue)
queue_delayed_work(aio_wq, &ctx->wq, 0);
if (kiocbIsCancelled(iocb))
goto put_rq;
- ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
+ ring = kmap_atomic(info->ring_pages[0]);
tail = info->tail;
- event = aio_ring_event(info, tail, KM_IRQ0);
+ event = aio_ring_event(info, tail);
if (++tail >= info->nr)
tail = 0;
info->tail = tail;
ring->tail = tail;
- put_aio_ring_event(event, KM_IRQ0);
- kunmap_atomic(ring, KM_IRQ1);
+ put_aio_ring_event(event);
+ kunmap_atomic(ring);
pr_debug("added to ring %p at [%lu]\n", iocb, tail);
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
+#ifdef CONFIG_EPOLL
+ if (ctx->file && waitqueue_active(&ctx->poll_wait))
+ wake_up(&ctx->poll_wait);
+#endif
+
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
return ret;
}
/* aio_read_evt
* Pull an event off of the ioctx's event ring. Returns the number of
* events fetched (0 or 1 ;-)
+ * If ent parameter is 0, just returns the number of events that would
+ * be fetched.
* FIXME: make this use cmpxchg.
* TODO: make the ringbuffer user mmap()able (requires FIXME).
*/
unsigned long head;
int ret = 0;
- ring = kmap_atomic(info->ring_pages[0], KM_USER0);
+ ring = kmap_atomic(info->ring_pages[0]);
dprintk("in aio_read_evt h%lu t%lu m%lu\n",
(unsigned long)ring->head, (unsigned long)ring->tail,
(unsigned long)ring->nr);
head = ring->head % info->nr;
if (head != ring->tail) {
- struct io_event *evp = aio_ring_event(info, head, KM_USER1);
- *ent = *evp;
- head = (head + 1) % info->nr;
- smp_mb(); /* finish reading the event before updatng the head */
- ring->head = head;
- ret = 1;
- put_aio_ring_event(evp, KM_USER1);
+ if (ent) { /* event requested */
+ struct io_event *evp = aio_ring_event(info, head);
+ *ent = *evp;
+ head = (head + 1) % info->nr;
+ /* finish reading the event before updatng the head */
+ smp_mb();
+ ring->head = head;
+ ret = 1;
+ put_aio_ring_event(evp);
+ } else /* only need to know availability */
+ ret = 1;
}
spin_unlock(&info->ring_lock);
out:
- kunmap_atomic(ring, KM_USER0);
+ kunmap_atomic(ring);
dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
(unsigned long)ring->head, (unsigned long)ring->tail);
return ret;
if (likely(!was_dead))
put_ioctx(ioctx); /* twice for the list */
- aio_cancel_all(ioctx);
- wait_for_all_aios(ioctx);
+ kill_ctx(ioctx);
+
+#ifdef CONFIG_EPOLL
+ /* forget the poll file, but it's up to the user to close it */
+ if (ioctx->file) {
+ fput(ioctx->file);
+ ioctx->file->private_data = 0;
+ ioctx->file = 0;
+ }
+#endif
/*
* Wake up any waiters. The setting of ctx->dead must be seen
* locking done by the above calls to ensure this consistency.
*/
wake_up_all(&ioctx->wait);
- put_ioctx(ioctx); /* once for the lookup */
}
+#ifdef CONFIG_EPOLL
+
+static int aio_queue_fd_close(struct inode *inode, struct file *file)
+{
+ struct kioctx *ioctx = file->private_data;
+ if (ioctx) {
+ file->private_data = 0;
+ spin_lock_irq(&ioctx->ctx_lock);
+ ioctx->file = 0;
+ spin_unlock_irq(&ioctx->ctx_lock);
+ fput(file);
+ }
+ return 0;
+}
+
+static unsigned int aio_queue_fd_poll(struct file *file, poll_table *wait)
+{ unsigned int pollflags = 0;
+ struct kioctx *ioctx = file->private_data;
+
+ if (ioctx) {
+
+ spin_lock_irq(&ioctx->ctx_lock);
+ /* Insert inside our poll wait queue */
+ poll_wait(file, &ioctx->poll_wait, wait);
+
+ /* Check our condition */
+ if (aio_read_evt(ioctx, 0))
+ pollflags = POLLIN | POLLRDNORM;
+ spin_unlock_irq(&ioctx->ctx_lock);
+ }
+
+ return pollflags;
+}
+
+static const struct file_operations aioq_fops = {
+ .release = aio_queue_fd_close,
+ .poll = aio_queue_fd_poll
+};
+
+/* make_aio_fd:
+ * Create a file descriptor that can be used to poll the event queue.
+ * Based on the excellent epoll code.
+ */
+
+static int make_aio_fd(struct kioctx *ioctx)
+{
+ int fd;
+ struct file *file;
+
+ fd = anon_inode_getfd("[aioq]", &aioq_fops, ioctx, 0);
+ if (fd < 0)
+ return fd;
+
+ /* associate the file with the IO context */
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+ file->private_data = ioctx;
+ ioctx->file = file;
+ init_waitqueue_head(&ioctx->poll_wait);
+ return fd;
+}
+#endif
+
/* sys_io_setup:
* Create an aio_context capable of receiving at least nr_events.
* ctxp must not point to an aio_context that already exists, and
* resources are available. May fail with -EFAULT if an invalid
* pointer is passed for ctxp. Will fail with -ENOSYS if not
* implemented.
+ *
+ * To request a selectable fd, the user context has to be initialized
+ * to 1, instead of 0, and the return value is the fd.
+ * This keeps the system call compatible, since a non-zero value
+ * was not allowed so far.
*/
SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
{
struct kioctx *ioctx = NULL;
unsigned long ctx;
long ret;
+ int make_fd = 0;
ret = get_user(ctx, ctxp);
if (unlikely(ret))
goto out;
ret = -EINVAL;
+#ifdef CONFIG_EPOLL
+ if (ctx == 1) {
+ make_fd = 1;
+ ctx = 0;
+ }
+#endif
if (unlikely(ctx || nr_events == 0)) {
pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
ctx, nr_events);
ret = PTR_ERR(ioctx);
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
- if (!ret) {
- put_ioctx(ioctx);
- return 0;
- }
- io_destroy(ioctx);
+#ifdef CONFIG_EPOLL
+ if (make_fd && !ret)
+ ret = make_aio_fd(ioctx);
+#endif
+ if (ret < 0)
+ io_destroy(ioctx);
+ put_ioctx(ioctx);
}
out:
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
io_destroy(ioctx);
+ put_ioctx(ioctx);
return 0;
}
pr_debug("EINVAL: io_destroy: invalid context id\n");