author    Linus Torvalds <torvalds@linux-foundation.org>    2019-01-05 09:16:18 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>    2019-01-05 09:16:18 -0800
commit    a65981109f294ba7e64b33ad3b4575a4636fce66 (patch)
tree      1061a49f11544e18775630938a8bc53920fa0421 /fs/eventpoll.c
parent    3fed6ae4b027f9c93be18520f87bd06bdffd196b (diff)
parent    b685a7350ae76bc0f388e24b36d06a63776c68ee (diff)
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:

 - procfs updates
 - various misc bits
 - lib/ updates
 - epoll updates
 - autofs
 - fatfs
 - a few more MM bits

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (58 commits)
  mm/page_io.c: fix polled swap page in
  checkpatch: add Co-developed-by to signature tags
  docs: fix Co-Developed-by docs
  drivers/base/platform.c: kmemleak ignore a known leak
  fs: don't open code lru_to_page()
  fs/: remove caller signal_pending branch predictions
  mm/: remove caller signal_pending branch predictions
  arch/arc/mm/fault.c: remove caller signal_pending_branch predictions
  kernel/sched/: remove caller signal_pending branch predictions
  kernel/locking/mutex.c: remove caller signal_pending branch predictions
  mm: select HAVE_MOVE_PMD on x86 for faster mremap
  mm: speed up mremap by 20x on large regions
  mm: treewide: remove unused address argument from pte_alloc functions
  initramfs: cleanup incomplete rootfs
  scripts/gdb: fix lx-version string output
  kernel/kcov.c: mark write_comp_data() as notrace
  kernel/sysctl: add panic_print into sysctl
  panic: add options to print system info when panic happens
  bfs: extra sanity checking and static inode bitmap
  exec: separate MM_ANONPAGES and RLIMIT_STACK accounting
  ...
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r--  fs/eventpoll.c | 218
1 file changed, 118 insertions(+), 100 deletions(-)
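
Before reading the hunks: the core epoll change in this merge converts ep->ovflist accesses to READ_ONCE()/WRITE_ONCE() and lets ep_events_available() run without ep->wq.lock via list_empty_careful(). The snippet below is a minimal userspace sketch of that marked-access pattern, not kernel code; the READ_ONCE/WRITE_ONCE definitions are simplified stand-ins for the kernel's compiler.h versions, and ovflist/events_available() are illustrative names only.

/*
 * Minimal userspace sketch of the marked-access pattern applied to
 * ep->ovflist below: one writer publishes a pointer with WRITE_ONCE()
 * and lockless readers sample it with READ_ONCE(), so the compiler may
 * not tear, fuse, or silently re-read the access.
 */
#include <stdio.h>

#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)       (*(volatile const __typeof__(x) *)&(x))

#define EP_UNACTIVE_PTR ((void *)-1L)	/* same sentinel idea as eventpoll.c */

static void *ovflist = EP_UNACTIVE_PTR;

/* Lockless readiness check, analogous to the reworked ep_events_available(). */
static int events_available(void)
{
	return READ_ONCE(ovflist) != EP_UNACTIVE_PTR;
}

int main(void)
{
	static int dummy_item;

	printf("before publish: %d\n", events_available());	/* prints 0 */
	WRITE_ONCE(ovflist, &dummy_item);			/* publish one item */
	printf("after publish:  %d\n", events_available());	/* prints 1 */
	return 0;
}
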
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 7ebae39fbcb3..a5d219d920e7 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -381,7 +381,8 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
*/
static inline int ep_events_available(struct eventpoll *ep)
{
- return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
+ return !list_empty_careful(&ep->rdllist) ||
+ READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}
#ifdef CONFIG_NET_RX_BUSY_POLL
@@ -471,7 +472,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
* no re-entered.
*
* @ncalls: Pointer to the nested_calls structure to be used for this call.
- * @max_nests: Maximum number of allowed nesting calls.
* @nproc: Nested call core function pointer.
* @priv: Opaque data to be passed to the @nproc callback.
* @cookie: Cookie to be used to identify this nested call.
@@ -480,7 +480,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
* Returns: Returns the code returned by the @nproc callback, or -1 if
* the maximum recursion limit has been exceeded.
*/
-static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
+static int ep_call_nested(struct nested_calls *ncalls,
int (*nproc)(void *, void *, int), void *priv,
void *cookie, void *ctx)
{
@@ -499,7 +499,7 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
*/
list_for_each_entry(tncur, lsthead, llink) {
if (tncur->ctx == ctx &&
- (tncur->cookie == cookie || ++call_nests > max_nests)) {
+ (tncur->cookie == cookie || ++call_nests > EP_MAX_NESTS)) {
/*
* Ops ... loop detected or maximum nest level reached.
* We abort this wake by breaking the cycle itself.
@@ -573,7 +573,7 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
{
int this_cpu = get_cpu();
- ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
+ ep_call_nested(&poll_safewake_ncalls,
ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);
put_cpu();
@@ -699,7 +699,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
*/
spin_lock_irq(&ep->wq.lock);
list_splice_init(&ep->rdllist, &txlist);
- ep->ovflist = NULL;
+ WRITE_ONCE(ep->ovflist, NULL);
spin_unlock_irq(&ep->wq.lock);
/*
@@ -713,7 +713,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* other events might have been queued by the poll callback.
* We re-insert them inside the main ready-list here.
*/
- for (nepi = ep->ovflist; (epi = nepi) != NULL;
+ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
/*
* We need to check if the item is already in the list.
@@ -731,7 +731,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep,
* releasing the lock, events will be queued in the normal way inside
* ep->rdllist.
*/
- ep->ovflist = EP_UNACTIVE_PTR;
+ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);
/*
* Quickly re-inject items left on "txlist".
@@ -1154,10 +1154,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* semantics). All the events that happen during that period of time are
* chained in ep->ovflist and requeued later on.
*/
- if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
+ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
if (epi->next == EP_UNACTIVE_PTR) {
- epi->next = ep->ovflist;
- ep->ovflist = epi;
+ epi->next = READ_ONCE(ep->ovflist);
+ WRITE_ONCE(ep->ovflist, epi);
if (epi->ws) {
/*
* Activate ep->ws since epi->ws may get
@@ -1333,7 +1333,6 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
}
} else {
error = ep_call_nested(&poll_loop_ncalls,
- EP_MAX_NESTS,
reverse_path_check_proc,
child_file, child_file,
current);
@@ -1367,7 +1366,7 @@ static int reverse_path_check(void)
/* let's call this for all tfiles */
list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
path_count_init();
- error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ error = ep_call_nested(&poll_loop_ncalls,
reverse_path_check_proc, current_file,
current_file, current);
if (error)
@@ -1626,21 +1625,24 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
{
struct ep_send_events_data *esed = priv;
__poll_t revents;
- struct epitem *epi;
- struct epoll_event __user *uevent;
+ struct epitem *epi, *tmp;
+ struct epoll_event __user *uevent = esed->events;
struct wakeup_source *ws;
poll_table pt;
init_poll_funcptr(&pt, NULL);
+ esed->res = 0;
/*
* We can loop without lock because we are passed a task private list.
* Items cannot vanish during the loop because ep_scan_ready_list() is
* holding "mtx" during this call.
*/
- for (esed->res = 0, uevent = esed->events;
- !list_empty(head) && esed->res < esed->maxevents;) {
- epi = list_first_entry(head, struct epitem, rdllink);
+ lockdep_assert_held(&ep->mtx);
+
+ list_for_each_entry_safe(epi, tmp, head, rdllink) {
+ if (esed->res >= esed->maxevents)
+ break;
/*
* Activate ep->ws before deactivating epi->ws to prevent
@@ -1660,42 +1662,42 @@ static __poll_t ep_send_events_proc(struct eventpoll *ep, struct list_head *head
list_del_init(&epi->rdllink);
- revents = ep_item_poll(epi, &pt, 1);
-
/*
* If the event mask intersect the caller-requested one,
* deliver the event to userspace. Again, ep_scan_ready_list()
- * is holding "mtx", so no operations coming from userspace
+ * is holding ep->mtx, so no operations coming from userspace
* can change the item.
*/
- if (revents) {
- if (__put_user(revents, &uevent->events) ||
- __put_user(epi->event.data, &uevent->data)) {
- list_add(&epi->rdllink, head);
- ep_pm_stay_awake(epi);
- if (!esed->res)
- esed->res = -EFAULT;
- return 0;
- }
- esed->res++;
- uevent++;
- if (epi->event.events & EPOLLONESHOT)
- epi->event.events &= EP_PRIVATE_BITS;
- else if (!(epi->event.events & EPOLLET)) {
- /*
- * If this file has been added with Level
- * Trigger mode, we need to insert back inside
- * the ready list, so that the next call to
- * epoll_wait() will check again the events
- * availability. At this point, no one can insert
- * into ep->rdllist besides us. The epoll_ctl()
- * callers are locked out by
- * ep_scan_ready_list() holding "mtx" and the
- * poll callback will queue them in ep->ovflist.
- */
- list_add_tail(&epi->rdllink, &ep->rdllist);
- ep_pm_stay_awake(epi);
- }
+ revents = ep_item_poll(epi, &pt, 1);
+ if (!revents)
+ continue;
+
+ if (__put_user(revents, &uevent->events) ||
+ __put_user(epi->event.data, &uevent->data)) {
+ list_add(&epi->rdllink, head);
+ ep_pm_stay_awake(epi);
+ if (!esed->res)
+ esed->res = -EFAULT;
+ return 0;
+ }
+ esed->res++;
+ uevent++;
+ if (epi->event.events & EPOLLONESHOT)
+ epi->event.events &= EP_PRIVATE_BITS;
+ else if (!(epi->event.events & EPOLLET)) {
+ /*
+ * If this file has been added with Level
+ * Trigger mode, we need to insert back inside
+ * the ready list, so that the next call to
+ * epoll_wait() will check again the events
+ * availability. At this point, no one can insert
+ * into ep->rdllist besides us. The epoll_ctl()
+ * callers are locked out by
+ * ep_scan_ready_list() holding "mtx" and the
+ * poll callback will queue them in ep->ovflist.
+ */
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake(epi);
}
}
@@ -1747,6 +1749,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
+ bool waiter = false;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
@@ -1761,11 +1764,18 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
} else if (timeout == 0) {
/*
* Avoid the unnecessary trip to the wait queue loop, if the
- * caller specified a non blocking operation.
+ * caller specified a non blocking operation. We still need
+ * lock because we could race and not see an epi being added
+ * to the ready list while in irq callback. Thus incorrectly
+ * returning 0 back to userspace.
*/
timed_out = 1;
+
spin_lock_irq(&ep->wq.lock);
- goto check_events;
+ eavail = ep_events_available(ep);
+ spin_unlock_irq(&ep->wq.lock);
+
+ goto send_events;
}
fetch_events:
@@ -1773,64 +1783,66 @@ fetch_events:
if (!ep_events_available(ep))
ep_busy_loop(ep, timed_out);
- spin_lock_irq(&ep->wq.lock);
+ eavail = ep_events_available(ep);
+ if (eavail)
+ goto send_events;
- if (!ep_events_available(ep)) {
- /*
- * Busy poll timed out. Drop NAPI ID for now, we can add
- * it back in when we have moved a socket with a valid NAPI
- * ID onto the ready list.
- */
- ep_reset_busy_poll_napi_id(ep);
+ /*
+ * Busy poll timed out. Drop NAPI ID for now, we can add
+ * it back in when we have moved a socket with a valid NAPI
+ * ID onto the ready list.
+ */
+ ep_reset_busy_poll_napi_id(ep);
- /*
- * We don't have any available event to return to the caller.
- * We need to sleep here, and we will be wake up by
- * ep_poll_callback() when events will become available.
- */
+ /*
+ * We don't have any available event to return to the caller. We need
+ * to sleep here, and we will be woken by ep_poll_callback() when events
+ * become available.
+ */
+ if (!waiter) {
+ waiter = true;
init_waitqueue_entry(&wait, current);
- __add_wait_queue_exclusive(&ep->wq, &wait);
- for (;;) {
- /*
- * We don't want to sleep if the ep_poll_callback() sends us
- * a wakeup in between. That's why we set the task state
- * to TASK_INTERRUPTIBLE before doing the checks.
- */
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Always short-circuit for fatal signals to allow
- * threads to make a timely exit without the chance of
- * finding more events available and fetching
- * repeatedly.
- */
- if (fatal_signal_pending(current)) {
- res = -EINTR;
- break;
- }
- if (ep_events_available(ep) || timed_out)
- break;
- if (signal_pending(current)) {
- res = -EINTR;
- break;
- }
+ spin_lock_irq(&ep->wq.lock);
+ __add_wait_queue_exclusive(&ep->wq, &wait);
+ spin_unlock_irq(&ep->wq.lock);
+ }
- spin_unlock_irq(&ep->wq.lock);
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
- timed_out = 1;
+ for (;;) {
+ /*
+ * We don't want to sleep if the ep_poll_callback() sends us
+ * a wakeup in between. That's why we set the task state
+ * to TASK_INTERRUPTIBLE before doing the checks.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ /*
+ * Always short-circuit for fatal signals to allow
+ * threads to make a timely exit without the chance of
+ * finding more events available and fetching
+ * repeatedly.
+ */
+ if (fatal_signal_pending(current)) {
+ res = -EINTR;
+ break;
+ }
- spin_lock_irq(&ep->wq.lock);
+ eavail = ep_events_available(ep);
+ if (eavail)
+ break;
+ if (signal_pending(current)) {
+ res = -EINTR;
+ break;
}
- __remove_wait_queue(&ep->wq, &wait);
- __set_current_state(TASK_RUNNING);
+ if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
+ timed_out = 1;
+ break;
+ }
}
-check_events:
- /* Is it worth to try to dig for events ? */
- eavail = ep_events_available(ep);
- spin_unlock_irq(&ep->wq.lock);
+ __set_current_state(TASK_RUNNING);
+send_events:
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
@@ -1840,6 +1852,12 @@ check_events:
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
+ if (waiter) {
+ spin_lock_irq(&ep->wq.lock);
+ __remove_wait_queue(&ep->wq, &wait);
+ spin_unlock_irq(&ep->wq.lock);
+ }
+
return res;
}
@@ -1876,7 +1894,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
ep_tovisit = epi->ffd.file->private_data;
if (ep_tovisit->visited)
continue;
- error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ error = ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, epi->ffd.file,
ep_tovisit, current);
if (error != 0)
@@ -1916,7 +1934,7 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
int ret;
struct eventpoll *ep_cur, *ep_next;
- ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+ ret = ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, file, ep, current);
/* clear visited list */
list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
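
As a usage note on the ep_poll() rework above: the timeout == 0 (non-blocking) case now takes ep->wq.lock just long enough to sample ep_events_available() before jumping to send_events, so a zero-timeout epoll_wait() still observes events queued by the wakeup callback. The small userspace program below (ordinary epoll API, no kernel-internal assumptions) exercises exactly that path.

/*
 * Exercise the timeout == 0 path of epoll_wait(): it must return
 * immediately, reporting 0 events on an empty pipe and 1 event once
 * the read end becomes ready.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
	int pipefd[2];
	struct epoll_event ev = { .events = EPOLLIN }, out;

	if (pipe(pipefd) < 0)
		exit(EXIT_FAILURE);

	int epfd = epoll_create1(0);
	if (epfd < 0)
		exit(EXIT_FAILURE);

	ev.data.fd = pipefd[0];
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, pipefd[0], &ev) < 0)
		exit(EXIT_FAILURE);

	/* Nothing written yet: zero timeout returns immediately with 0 events. */
	printf("empty pipe:  %d ready\n", epoll_wait(epfd, &out, 1, 0));

	/* Make the read end ready, then poll again without blocking. */
	if (write(pipefd[1], "x", 1) != 1)
		exit(EXIT_FAILURE);
	printf("after write: %d ready\n", epoll_wait(epfd, &out, 1, 0));

	close(epfd);
	close(pipefd[0]);
	close(pipefd[1]);
	return 0;
}
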