summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c190
1 files changed, 140 insertions, 50 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a3b97f103966..f973a025569b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1314,7 +1314,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
if (do_memsw_account()) {
count = page_counter_read(&memcg->memsw);
limit = READ_ONCE(memcg->memsw.max);
- if (count <= limit)
+ if (count < limit)
margin = min(margin, limit - count);
else
margin = 0;
@@ -1451,6 +1451,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
memcg_page_state(memcg, WORKINGSET_REFAULT));
seq_buf_printf(&s, "workingset_activate %lu\n",
memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+ seq_buf_printf(&s, "workingset_restore %lu\n",
+ memcg_page_state(memcg, WORKINGSET_RESTORE));
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
@@ -2250,7 +2252,8 @@ static void reclaim_high(struct mem_cgroup *memcg,
gfp_t gfp_mask)
{
do {
- if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high))
+ if (page_counter_read(&memcg->memory) <=
+ READ_ONCE(memcg->memory.high))
continue;
memcg_memory_event(memcg, MEMCG_HIGH);
try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
@@ -2319,41 +2322,64 @@ static void high_work_func(struct work_struct *work)
#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14
-/*
- * Get the number of jiffies that we should penalise a mischievous cgroup which
- * is exceeding its memory.high by checking both it and its ancestors.
- */
-static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
- unsigned int nr_pages)
+static u64 calculate_overage(unsigned long usage, unsigned long high)
{
- unsigned long penalty_jiffies;
- u64 max_overage = 0;
+ u64 overage;
- do {
- unsigned long usage, high;
- u64 overage;
+ if (usage <= high)
+ return 0;
- usage = page_counter_read(&memcg->memory);
- high = READ_ONCE(memcg->high);
+ /*
+ * Prevent division by 0 in overage calculation by acting as if
+ * it was a threshold of 1 page
+ */
+ high = max(high, 1UL);
- if (usage <= high)
- continue;
+ overage = usage - high;
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+ return div64_u64(overage, high);
+}
- /*
- * Prevent division by 0 in overage calculation by acting as if
- * it was a threshold of 1 page
- */
- high = max(high, 1UL);
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
+
+ do {
+ overage = calculate_overage(page_counter_read(&memcg->memory),
+ READ_ONCE(memcg->memory.high));
+ max_overage = max(overage, max_overage);
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
- overage = usage - high;
- overage <<= MEMCG_DELAY_PRECISION_SHIFT;
- overage = div64_u64(overage, high);
+ return max_overage;
+}
- if (overage > max_overage)
- max_overage = overage;
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
+
+ do {
+ overage = calculate_overage(page_counter_read(&memcg->swap),
+ READ_ONCE(memcg->swap.high));
+ if (overage)
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
+ max_overage = max(overage, max_overage);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
+ return max_overage;
+}
+
+/*
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
+ */
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ u64 max_overage)
+{
+ unsigned long penalty_jiffies;
+
if (!max_overage)
return 0;
@@ -2377,14 +2403,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
* MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
* larger the current charge patch is than that.
*/
- penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
-
- /*
- * Clamp the max delay per usermode return so as to still keep the
- * application moving forwards and also permit diagnostics, albeit
- * extremely slowly.
- */
- return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
}
/*
@@ -2409,7 +2428,18 @@ void mem_cgroup_handle_over_high(void)
* memory.high is breached and reclaim is unable to keep up. Throttle
* allocators proactively to slow down excessive growth.
*/
- penalty_jiffies = calculate_high_delay(memcg, nr_pages);
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
+ mem_find_max_overage(memcg));
+
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
+ swap_find_max_overage(memcg));
+
+ /*
+ * Clamp the max delay per usermode return so as to still keep the
+ * application moving forwards and also permit diagnostics, albeit
+ * extremely slowly.
+ */
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
/*
* Don't sleep if the amount of jiffies this memcg owes us is so low
@@ -2594,12 +2624,32 @@ done_restock:
* reclaim, the cost of mismatch is negligible.
*/
do {
- if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) {
- /* Don't bother a random interrupted task */
- if (in_interrupt()) {
+ bool mem_high, swap_high;
+
+ mem_high = page_counter_read(&memcg->memory) >
+ READ_ONCE(memcg->memory.high);
+ swap_high = page_counter_read(&memcg->swap) >
+ READ_ONCE(memcg->swap.high);
+
+ /* Don't bother a random interrupted task */
+ if (in_interrupt()) {
+ if (mem_high) {
schedule_work(&memcg->high_work);
break;
}
+ continue;
+ }
+
+ if (mem_high || swap_high) {
+ /*
+ * The allocating tasks in this cgroup will need to do
+ * reclaim or be throttled to prevent further growth
+ * of the memory or swap footprints.
+ *
+ * Target some best-effort fairness between the tasks,
+ * and distribute reclaim work and delay penalties
+ * based on how much each task is actually allocating.
+ */
current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
break;
@@ -2802,7 +2852,12 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
static inline bool memcg_kmem_bypass(void)
{
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+ if (in_interrupt())
+ return true;
+
+ /* Allow remote memcg charging in kthread contexts. */
+ if ((!current->mm || (current->flags & PF_KTHREAD)) &&
+ !current->active_memcg)
return true;
return false;
}
@@ -4330,7 +4385,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
- /* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
*pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
@@ -4338,7 +4392,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
while ((parent = parent_mem_cgroup(memcg))) {
unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
- READ_ONCE(memcg->high));
+ READ_ONCE(memcg->memory.high));
unsigned long used = page_counter_read(&memcg->memory);
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
@@ -5063,8 +5117,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(memcg))
return ERR_CAST(memcg);
- WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
memcg->oom_kill_disable = parent->oom_kill_disable;
@@ -5216,8 +5271,9 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
- WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
memcg_wb_domain_size_changed(memcg);
}
@@ -6015,7 +6071,8 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
static int memory_high_show(struct seq_file *m, void *v)
{
- return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
}
static ssize_t memory_high_write(struct kernfs_open_file *of,
@@ -6032,7 +6089,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (err)
return err;
- WRITE_ONCE(memcg->high, high);
+ page_counter_set_high(&memcg->memory, high);
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -6227,7 +6284,6 @@ static struct cftype memory_files[] = {
},
{
.name = "stat",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
},
{
@@ -7131,10 +7187,13 @@ bool mem_cgroup_swap_full(struct page *page)
if (!memcg)
return false;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
- if (page_counter_read(&memcg->swap) * 2 >=
- READ_ONCE(memcg->swap.max))
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ unsigned long usage = page_counter_read(&memcg->swap);
+
+ if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
+ usage * 2 >= READ_ONCE(memcg->swap.max))
return true;
+ }
return false;
}
@@ -7164,6 +7223,29 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
+static int swap_high_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
+}
+
+static ssize_t swap_high_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long high;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &high);
+ if (err)
+ return err;
+
+ page_counter_set_high(&memcg->swap, high);
+
+ return nbytes;
+}
+
static int swap_max_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -7191,6 +7273,8 @@ static int swap_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ seq_printf(m, "high %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
seq_printf(m, "max %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
seq_printf(m, "fail %lu\n",
@@ -7206,6 +7290,12 @@ static struct cftype swap_files[] = {
.read_u64 = swap_current_read,
},
{
+ .name = "swap.high",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_high_show,
+ .write = swap_high_write,
+ },
+ {
.name = "swap.max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = swap_max_show,