summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt23
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/btf.c35
-rw-r--r--kernel/bpf/cgroup.c27
-rw-r--r--kernel/bpf/core.c13
-rw-r--r--kernel/bpf/devmap.c339
-rw-r--r--kernel/bpf/syscall.c51
-rw-r--r--kernel/bpf/sysfs_btf.c46
-rw-r--r--kernel/bpf/verifier.c56
-rw-r--r--kernel/bpf/xskmap.c133
-rw-r--r--kernel/cgroup/cgroup-v1.c27
-rw-r--r--kernel/cgroup/cgroup.c66
-rw-r--r--kernel/cgroup/cpuset.c224
-rw-r--r--kernel/configs.c16
-rw-r--r--kernel/cpu.c35
-rw-r--r--kernel/cred.c21
-rw-r--r--kernel/debug/debug_core.c5
-rw-r--r--kernel/debug/kdb/kdb_main.c2
-rw-r--r--kernel/dma/Kconfig11
-rw-r--r--kernel/dma/coherent.c13
-rw-r--r--kernel/dma/contiguous.c54
-rw-r--r--kernel/dma/direct.c113
-rw-r--r--kernel/dma/mapping.c151
-rw-r--r--kernel/dma/remap.c69
-rw-r--r--kernel/dma/swiotlb.c92
-rw-r--r--kernel/events/core.c199
-rw-r--r--kernel/events/hw_breakpoint.c4
-rw-r--r--kernel/exit.c44
-rw-r--r--kernel/extable.c11
-rw-r--r--kernel/fail_function.c23
-rw-r--r--kernel/fork.c75
-rw-r--r--kernel/futex.c12
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/gcov/fs.c24
-rwxr-xr-xkernel/gen_kheaders.sh51
-rw-r--r--kernel/iomem.c2
-rw-r--r--kernel/irq/affinity.c237
-rw-r--r--kernel/irq/irqdesc.c15
-rw-r--r--kernel/irq/irqdomain.c10
-rw-r--r--kernel/irq/manage.c5
-rw-r--r--kernel/irq/pm.c20
-rw-r--r--kernel/irq/proc.c14
-rw-r--r--kernel/irq/resend.c2
-rw-r--r--kernel/jump_label.c4
-rw-r--r--kernel/kallsyms.c6
-rw-r--r--kernel/kexec_elf.c430
-rw-r--r--kernel/kprobes.c97
-rw-r--r--kernel/locking/lockdep.c174
-rw-r--r--kernel/locking/lockdep_internals.h9
-rw-r--r--kernel/locking/lockdep_proc.c16
-rw-r--r--kernel/locking/mutex.c39
-rw-r--r--kernel/locking/rtmutex.c8
-rw-r--r--kernel/locking/rwsem.c82
-rw-r--r--kernel/memremap.c374
-rw-r--r--kernel/module.c140
-rw-r--r--kernel/padata.c311
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/pid.c9
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/power/Kconfig6
-rw-r--r--kernel/power/autosleep.c2
-rw-r--r--kernel/power/main.c99
-rw-r--r--kernel/power/qos.c48
-rw-r--r--kernel/power/suspend.c65
-rw-r--r--kernel/power/wakelock.c32
-rw-r--r--kernel/printk/braille.c15
-rw-r--r--kernel/printk/printk.c24
-rw-r--r--kernel/ptrace.c101
-rw-r--r--kernel/rcu/Kconfig8
-rw-r--r--kernel/rcu/Kconfig.debug11
-rw-r--r--kernel/rcu/rcu.h1
-rw-r--r--kernel/rcu/rcu_segcblist.c174
-rw-r--r--kernel/rcu/rcu_segcblist.h54
-rw-r--r--kernel/rcu/rcuperf.c10
-rw-r--r--kernel/rcu/rcutorture.c30
-rw-r--r--kernel/rcu/srcutree.c5
-rw-r--r--kernel/rcu/tree.c217
-rw-r--r--kernel/rcu/tree.h81
-rw-r--r--kernel/rcu/tree_exp.h8
-rw-r--r--kernel/rcu/tree_plugin.h1195
-rw-r--r--kernel/rcu/tree_stall.h15
-rw-r--r--kernel/rcu/update.c105
-rw-r--r--kernel/resource.c105
-rw-r--r--kernel/sched/core.c730
-rw-r--r--kernel/sched/cpufreq_schedutil.c27
-rw-r--r--kernel/sched/deadline.c150
-rw-r--r--kernel/sched/fair.c556
-rw-r--r--kernel/sched/idle.c43
-rw-r--r--kernel/sched/isolation.c18
-rw-r--r--kernel/sched/psi.c14
-rw-r--r--kernel/sched/rt.c87
-rw-r--r--kernel/sched/sched.h63
-rw-r--r--kernel/sched/stats.h7
-rw-r--r--kernel/sched/stop_task.c22
-rw-r--r--kernel/sched/topology.c53
-rw-r--r--kernel/signal.c84
-rw-r--r--kernel/smp.c16
-rw-r--r--kernel/stacktrace.c9
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c48
-rw-r--r--kernel/sysctl.c215
-rw-r--r--kernel/time/alarmtimer.c22
-rw-r--r--kernel/time/hrtimer.c235
-rw-r--r--kernel/time/itimer.c12
-rw-r--r--kernel/time/posix-cpu-timers.c1010
-rw-r--r--kernel/time/posix-timers.c61
-rw-r--r--kernel/time/posix-timers.h1
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c13
-rw-r--r--kernel/time/tick-sched.c17
-rw-r--r--kernel/time/timekeeping.c5
-rw-r--r--kernel/time/timer.c105
-rw-r--r--kernel/time/vsyscall.c22
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/Kconfig21
-rw-r--r--kernel/trace/blktrace.c6
-rw-r--r--kernel/trace/fgraph.c4
-rw-r--r--kernel/trace/ftrace.c73
-rw-r--r--kernel/trace/ftrace_internal.h8
-rw-r--r--kernel/trace/ring_buffer.c17
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace.c63
-rw-r--r--kernel/trace/trace.h1
-rw-r--r--kernel/trace/trace_dynevent.c10
-rw-r--r--kernel/trace/trace_dynevent.h7
-rw-r--r--kernel/trace/trace_event_perf.c3
-rw-r--r--kernel/trace/trace_events.c16
-rw-r--r--kernel/trace/trace_events_filter.c3
-rw-r--r--kernel/trace/trace_events_hist.c25
-rw-r--r--kernel/trace/trace_functions_graph.c17
-rw-r--r--kernel/trace/trace_kprobe.c603
-rw-r--r--kernel/trace/trace_output.c15
-rw-r--r--kernel/trace/trace_probe.c283
-rw-r--r--kernel/trace/trace_probe.h125
-rw-r--r--kernel/trace/trace_probe_tmpl.h36
-rw-r--r--kernel/trace/trace_sched_wakeup.c3
-rw-r--r--kernel/trace/trace_stack.c112
-rw-r--r--kernel/trace/trace_uprobe.c455
-rw-r--r--kernel/tracepoint.c4
-rw-r--r--kernel/ucount.c6
-rw-r--r--kernel/watchdog.c4
-rw-r--r--kernel/workqueue.c25
142 files changed, 7926 insertions, 4164 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index dc0b682ec2d9..deff97217496 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -38,7 +38,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPT_COUNT
+ select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
This option reduces the latency of the kernel by making
@@ -55,7 +55,28 @@ config PREEMPT
embedded system with latency requirements in the milliseconds
range.
+config PREEMPT_RT
+ bool "Fully Preemptible Kernel (Real-Time)"
+ depends on EXPERT && ARCH_SUPPORTS_RT
+ select PREEMPTION
+ help
+ This option turns the kernel into a real-time kernel by replacing
+ various locking primitives (spinlocks, rwlocks, etc.) with
+ preemptible priority-inheritance aware variants, enforcing
+ interrupt threading and introducing mechanisms to break up long
+ non-preemptible sections. This makes the kernel, except for very
+ low level and critical code pathes (entry code, scheduler, low
+ level interrupt handling) fully preemptible and brings most
+ execution contexts under scheduler control.
+
+ Select this if you are building a kernel for systems which
+ require real-time guarantees.
+
endchoice
config PREEMPT_COUNT
bool
+
+config PREEMPTION
+ bool
+ select PREEMPT_COUNT
diff --git a/kernel/Makefile b/kernel/Makefile
index a8d923b5481b..25f9d83d1bbf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -64,6 +64,7 @@ obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
+obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup/
@@ -111,7 +112,6 @@ obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
obj-$(CONFIG_HAS_IOMEM) += iomem.o
-obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_RSEQ) += rseq.o
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
@@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
-cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@
+ cmd_genikh = $(BASH) $(srctree)/kernel/gen_kheaders.sh $@
$(obj)/kheaders_data.tar.xz: FORCE
$(call cmd,genikh)
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 29d781061cd5..e1d9adb212f9 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -22,3 +22,6 @@ obj-$(CONFIG_CGROUP_BPF) += cgroup.o
ifeq ($(CONFIG_INET),y)
obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
endif
+ifeq ($(CONFIG_SYSFS),y)
+obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
+endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 546ebee39e2a..adb3adcebe3c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -195,8 +195,8 @@
i < btf_type_vlen(struct_type); \
i++, member++)
-static DEFINE_IDR(btf_idr);
-static DEFINE_SPINLOCK(btf_idr_lock);
+DEFINE_IDR(btf_idr);
+DEFINE_SPINLOCK(btf_idr_lock);
struct btf {
void *data;
@@ -1073,11 +1073,18 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
!btf_type_is_var(size_type)))
return NULL;
- size = btf->resolved_sizes[size_type_id];
size_type_id = btf->resolved_ids[size_type_id];
size_type = btf_type_by_id(btf, size_type_id);
if (btf_type_nosize_or_null(size_type))
return NULL;
+ else if (btf_type_has_size(size_type))
+ size = size_type->size;
+ else if (btf_type_is_array(size_type))
+ size = btf->resolved_sizes[size_type_id];
+ else if (btf_type_is_ptr(size_type))
+ size = sizeof(void *);
+ else
+ return NULL;
}
*type_id = size_type_id;
@@ -1602,7 +1609,6 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
const struct btf_type *next_type;
u32 next_type_id = t->type;
struct btf *btf = env->btf;
- u32 next_type_size = 0;
next_type = btf_type_by_id(btf, next_type_id);
if (!next_type || btf_type_is_resolve_source_only(next_type)) {
@@ -1620,7 +1626,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
* save us a few type-following when we use it later (e.g. in
* pretty print).
*/
- if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
+ if (!btf_type_id_size(btf, &next_type_id, NULL)) {
if (env_type_is_resolved(env, next_type_id))
next_type = btf_type_id_resolve(btf, &next_type_id);
@@ -1633,7 +1639,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
}
}
- env_stack_pop_resolved(env, next_type_id, next_type_size);
+ env_stack_pop_resolved(env, next_type_id, 0);
return 0;
}
@@ -1645,7 +1651,6 @@ static int btf_var_resolve(struct btf_verifier_env *env,
const struct btf_type *t = v->t;
u32 next_type_id = t->type;
struct btf *btf = env->btf;
- u32 next_type_size;
next_type = btf_type_by_id(btf, next_type_id);
if (!next_type || btf_type_is_resolve_source_only(next_type)) {
@@ -1675,12 +1680,12 @@ static int btf_var_resolve(struct btf_verifier_env *env,
* forward types or similar that would resolve to size of
* zero is allowed.
*/
- if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
+ if (!btf_type_id_size(btf, &next_type_id, NULL)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
- env_stack_pop_resolved(env, next_type_id, next_type_size);
+ env_stack_pop_resolved(env, next_type_id, 0);
return 0;
}
@@ -3371,6 +3376,15 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
}
+#ifdef CONFIG_PROC_FS
+static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ const struct btf *btf = filp->private_data;
+
+ seq_printf(m, "btf_id:\t%u\n", btf->id);
+}
+#endif
+
static int btf_release(struct inode *inode, struct file *filp)
{
btf_put(filp->private_data);
@@ -3378,6 +3392,9 @@ static int btf_release(struct inode *inode, struct file *filp)
}
const struct file_operations btf_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_btf_show_fdinfo,
+#endif
.release = btf_release,
};
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 0a00eaca6fae..ddd8addcdb5c 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -964,7 +964,6 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
return -ENOMEM;
ctx->optval_end = ctx->optval + max_optlen;
- ctx->optlen = max_optlen;
return 0;
}
@@ -984,7 +983,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
.level = *level,
.optname = *optname,
};
- int ret;
+ int ret, max_optlen;
/* Opportunistic check to see whether we have any BPF program
* attached to the hook so we don't waste time allocating
@@ -994,10 +993,18 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
return 0;
- ret = sockopt_alloc_buf(&ctx, *optlen);
+ /* Allocate a bit more than the initial user buffer for
+ * BPF program. The canonical use case is overriding
+ * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
+ */
+ max_optlen = max_t(int, 16, *optlen);
+
+ ret = sockopt_alloc_buf(&ctx, max_optlen);
if (ret)
return ret;
+ ctx.optlen = *optlen;
+
if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
ret = -EFAULT;
goto out;
@@ -1016,7 +1023,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
if (ctx.optlen == -1) {
/* optlen set to -1, bypass kernel */
ret = 1;
- } else if (ctx.optlen > *optlen || ctx.optlen < -1) {
+ } else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
/* optlen is out of bounds */
ret = -EFAULT;
} else {
@@ -1063,6 +1070,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
if (ret)
return ret;
+ ctx.optlen = max_optlen;
+
if (!retval) {
/* If kernel getsockopt finished successfully,
* copy whatever was returned to the user back
@@ -1325,6 +1334,7 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
+ u32 read_size;
switch (si->off) {
case offsetof(struct bpf_sysctl, write):
@@ -1356,7 +1366,9 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, ppos));
*insn++ = BPF_STX_MEM(
- BPF_SIZEOF(u32), treg, si->src_reg, 0);
+ BPF_SIZEOF(u32), treg, si->src_reg,
+ bpf_ctx_narrow_access_offset(
+ 0, sizeof(u32), sizeof(loff_t)));
*insn++ = BPF_LDX_MEM(
BPF_DW, treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, tmp_reg));
@@ -1365,8 +1377,11 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
si->dst_reg, si->src_reg,
offsetof(struct bpf_sysctl_kern, ppos));
+ read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
*insn++ = BPF_LDX_MEM(
- BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
+ BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
+ bpf_ctx_narrow_access_offset(
+ 0, read_size, sizeof(loff_t)));
}
*target_size = sizeof(u32);
break;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 16079550db6d..66088a9e9b9e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -890,7 +890,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
static int bpf_jit_blind_insn(const struct bpf_insn *from,
const struct bpf_insn *aux,
- struct bpf_insn *to_buff)
+ struct bpf_insn *to_buff,
+ bool emit_zext)
{
struct bpf_insn *to = to_buff;
u32 imm_rnd = get_random_int();
@@ -1005,6 +1006,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ if (emit_zext)
+ *to++ = BPF_ZEXT_REG(BPF_REG_AX);
*to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX);
break;
@@ -1088,7 +1091,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
insn[1].code == 0)
memcpy(aux, insn, sizeof(aux));
- rewritten = bpf_jit_blind_insn(insn, aux, insn_buff);
+ rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
+ clone->aux->verifier_zext);
if (!rewritten)
continue;
@@ -1295,11 +1299,11 @@ bool bpf_opcode_in_insntable(u8 code)
*
* Decode and execute eBPF instructions.
*/
-static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
+static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
- static const void *jumptable[256] = {
+ static const void * const jumptable[256] __annotate_jump_table = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL),
@@ -1558,7 +1562,6 @@ out:
BUG_ON(1);
return 0;
}
-STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
#define DEFINE_BPF_PROG_RUN(stack_size) \
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index d83cf8ccc872..d27f3b60ff6d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -37,6 +37,12 @@
* notifier hook walks the map we know that new dev references can not be
* added by the user because core infrastructure ensures dev_get_by_index()
* calls will fail at this point.
+ *
+ * The devmap_hash type is a map type which interprets keys as ifindexes and
+ * indexes these using a hashmap. This allows maps that use ifindex as key to be
+ * densely packed instead of having holes in the lookup array for unused
+ * ifindexes. The setup and packet enqueue/send code is shared between the two
+ * types of devmap; only the lookup and insertion is different.
*/
#include <linux/bpf.h>
#include <net/xdp.h>
@@ -59,10 +65,11 @@ struct xdp_bulk_queue {
struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
+ struct hlist_node index_hlist;
struct bpf_dtab *dtab;
- unsigned int bit;
struct xdp_bulk_queue __percpu *bulkq;
struct rcu_head rcu;
+ unsigned int idx; /* keep track of map index for tracepoint */
};
struct bpf_dtab {
@@ -70,33 +77,45 @@ struct bpf_dtab {
struct bpf_dtab_netdev **netdev_map;
struct list_head __percpu *flush_list;
struct list_head list;
+
+ /* these are only used for DEVMAP_HASH type maps */
+ struct hlist_head *dev_index_head;
+ spinlock_t index_lock;
+ unsigned int items;
+ u32 n_buckets;
};
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
-static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+static struct hlist_head *dev_map_create_hash(unsigned int entries)
+{
+ int i;
+ struct hlist_head *hash;
+
+ hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
+ if (hash != NULL)
+ for (i = 0; i < entries; i++)
+ INIT_HLIST_HEAD(&hash[i]);
+
+ return hash;
+}
+
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
- struct bpf_dtab *dtab;
int err, cpu;
u64 cost;
- if (!capable(CAP_NET_ADMIN))
- return ERR_PTR(-EPERM);
-
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
/* Lookup returns a pointer straight to dev->ifindex, so make sure the
* verifier prevents writes from the BPF side
*/
attr->map_flags |= BPF_F_RDONLY_PROG;
- dtab = kzalloc(sizeof(*dtab), GFP_USER);
- if (!dtab)
- return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&dtab->map, attr);
@@ -104,12 +123,18 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
cost += sizeof(struct list_head) * num_possible_cpus();
+ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
+
+ if (!dtab->n_buckets) /* Overflow check */
+ return -EINVAL;
+ cost += sizeof(struct hlist_head) * dtab->n_buckets;
+ }
+
/* if map size is larger than memlock limit, reject it */
err = bpf_map_charge_init(&dtab->map.memory, cost);
if (err)
- goto free_dtab;
-
- err = -ENOMEM;
+ return -EINVAL;
dtab->flush_list = alloc_percpu(struct list_head);
if (!dtab->flush_list)
@@ -124,19 +149,48 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (!dtab->netdev_map)
goto free_percpu;
- spin_lock(&dev_map_lock);
- list_add_tail_rcu(&dtab->list, &dev_map_list);
- spin_unlock(&dev_map_lock);
+ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
+ if (!dtab->dev_index_head)
+ goto free_map_area;
- return &dtab->map;
+ spin_lock_init(&dtab->index_lock);
+ }
+ return 0;
+
+free_map_area:
+ bpf_map_area_free(dtab->netdev_map);
free_percpu:
free_percpu(dtab->flush_list);
free_charge:
bpf_map_charge_finish(&dtab->map.memory);
-free_dtab:
- kfree(dtab);
- return ERR_PTR(err);
+ return -ENOMEM;
+}
+
+static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_dtab *dtab;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ dtab = kzalloc(sizeof(*dtab), GFP_USER);
+ if (!dtab)
+ return ERR_PTR(-ENOMEM);
+
+ err = dev_map_init_map(dtab, attr);
+ if (err) {
+ kfree(dtab);
+ return ERR_PTR(err);
+ }
+
+ spin_lock(&dev_map_lock);
+ list_add_tail_rcu(&dtab->list, &dev_map_list);
+ spin_unlock(&dev_map_lock);
+
+ return &dtab->map;
}
static void dev_map_free(struct bpf_map *map)
@@ -188,6 +242,7 @@ static void dev_map_free(struct bpf_map *map)
free_percpu(dtab->flush_list);
bpf_map_area_free(dtab->netdev_map);
+ kfree(dtab->dev_index_head);
kfree(dtab);
}
@@ -208,6 +263,70 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
+static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
+ int idx)
+{
+ return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
+}
+
+struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct hlist_head *head = dev_map_index_hash(dtab, key);
+ struct bpf_dtab_netdev *dev;
+
+ hlist_for_each_entry_rcu(dev, head, index_hlist)
+ if (dev->idx == key)
+ return dev;
+
+ return NULL;
+}
+
+static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u32 idx, *next = next_key;
+ struct bpf_dtab_netdev *dev, *next_dev;
+ struct hlist_head *head;
+ int i = 0;
+
+ if (!key)
+ goto find_first;
+
+ idx = *(u32 *)key;
+
+ dev = __dev_map_hash_lookup_elem(map, idx);
+ if (!dev)
+ goto find_first;
+
+ next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
+ struct bpf_dtab_netdev, index_hlist);
+
+ if (next_dev) {
+ *next = next_dev->idx;
+ return 0;
+ }
+
+ i = idx & (dtab->n_buckets - 1);
+ i++;
+
+ find_first:
+ for (; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+
+ next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ struct bpf_dtab_netdev,
+ index_hlist);
+ if (next_dev) {
+ *next = next_dev->idx;
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
bool in_napi_ctx)
{
@@ -235,7 +354,7 @@ static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
out:
bq->count = 0;
- trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
+ trace_xdp_devmap_xmit(&obj->dtab->map, obj->idx,
sent, drops, bq->dev_rx, dev, err);
bq->dev_rx = NULL;
__list_del_clearprev(&bq->flush_node);
@@ -363,6 +482,15 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
return dev ? &dev->ifindex : NULL;
}
+static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
+ *(u32 *)key);
+ struct net_device *dev = obj ? obj->dev : NULL;
+
+ return dev ? &dev->ifindex : NULL;
+}
+
static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
if (dev->dev->netdev_ops->ndo_xdp_xmit) {
@@ -412,17 +540,74 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct net *net = current->nsproxy->net_ns;
+ struct bpf_dtab_netdev *old_dev;
+ int k = *(u32 *)key;
+ unsigned long flags;
+ int ret = -ENOENT;
+
+ spin_lock_irqsave(&dtab->index_lock, flags);
+
+ old_dev = __dev_map_hash_lookup_elem(map, k);
+ if (old_dev) {
+ dtab->items--;
+ hlist_del_init_rcu(&old_dev->index_hlist);
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+
+ return ret;
+}
+
+static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
+ struct bpf_dtab *dtab,
+ u32 ifindex,
+ unsigned int idx)
+{
gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+ struct bpf_dtab_netdev *dev;
+ struct xdp_bulk_queue *bq;
+ int cpu;
+
+ dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node);
+ if (!dev)
+ return ERR_PTR(-ENOMEM);
+
+ dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+ sizeof(void *), gfp);
+ if (!dev->bulkq) {
+ kfree(dev);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for_each_possible_cpu(cpu) {
+ bq = per_cpu_ptr(dev->bulkq, cpu);
+ bq->obj = dev;
+ }
+
+ dev->dev = dev_get_by_index(net, ifindex);
+ if (!dev->dev) {
+ free_percpu(dev->bulkq);
+ kfree(dev);
+ return ERR_PTR(-EINVAL);
+ }
+
+ dev->idx = idx;
+ dev->dtab = dtab;
+
+ return dev;
+}
+
+static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
u32 ifindex = *(u32 *)value;
- struct xdp_bulk_queue *bq;
u32 i = *(u32 *)key;
- int cpu;
if (unlikely(map_flags > BPF_EXIST))
return -EINVAL;
@@ -434,31 +619,9 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
if (!ifindex) {
dev = NULL;
} else {
- dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
- if (!dev)
- return -ENOMEM;
-
- dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
- sizeof(void *), gfp);
- if (!dev->bulkq) {
- kfree(dev);
- return -ENOMEM;
- }
-
- for_each_possible_cpu(cpu) {
- bq = per_cpu_ptr(dev->bulkq, cpu);
- bq->obj = dev;
- }
-
- dev->dev = dev_get_by_index(net, ifindex);
- if (!dev->dev) {
- free_percpu(dev->bulkq);
- kfree(dev);
- return -EINVAL;
- }
-
- dev->bit = i;
- dev->dtab = dtab;
+ dev = __dev_map_alloc_node(net, dtab, ifindex, i);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
}
/* Use call_rcu() here to ensure rcu critical sections have completed
@@ -472,6 +635,70 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
return 0;
}
+static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ return __dev_map_update_elem(current->nsproxy->net_ns,
+ map, key, value, map_flags);
+}
+
+static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dev, *old_dev;
+ u32 ifindex = *(u32 *)value;
+ u32 idx = *(u32 *)key;
+ unsigned long flags;
+ int err = -EEXIST;
+
+ if (unlikely(map_flags > BPF_EXIST || !ifindex))
+ return -EINVAL;
+
+ spin_lock_irqsave(&dtab->index_lock, flags);
+
+ old_dev = __dev_map_hash_lookup_elem(map, idx);
+ if (old_dev && (map_flags & BPF_NOEXIST))
+ goto out_err;
+
+ dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ goto out_err;
+ }
+
+ if (old_dev) {
+ hlist_del_rcu(&old_dev->index_hlist);
+ } else {
+ if (dtab->items >= dtab->map.max_entries) {
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+ call_rcu(&dev->rcu, __dev_map_entry_free);
+ return -E2BIG;
+ }
+ dtab->items++;
+ }
+
+ hlist_add_head_rcu(&dev->index_hlist,
+ dev_map_index_hash(dtab, idx));
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+
+ if (old_dev)
+ call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+ return 0;
+
+out_err:
+ spin_unlock_irqrestore(&dtab->index_lock, flags);
+ return err;
+}
+
+static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ return __dev_map_hash_update_elem(current->nsproxy->net_ns,
+ map, key, value, map_flags);
+}
+
const struct bpf_map_ops dev_map_ops = {
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
@@ -482,6 +709,16 @@ const struct bpf_map_ops dev_map_ops = {
.map_check_btf = map_check_no_btf,
};
+const struct bpf_map_ops dev_map_hash_ops = {
+ .map_alloc = dev_map_alloc,
+ .map_free = dev_map_free,
+ .map_get_next_key = dev_map_hash_get_next_key,
+ .map_lookup_elem = dev_map_hash_lookup_elem,
+ .map_update_elem = dev_map_hash_update_elem,
+ .map_delete_elem = dev_map_hash_delete_elem,
+ .map_check_btf = map_check_no_btf,
+};
+
static int dev_map_notification(struct notifier_block *notifier,
ulong event, void *ptr)
{
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5d141f16f6fa..82eabd4e38ad 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -683,8 +683,8 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
}
/* map_idr_lock should have been held */
-static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
- bool uref)
+static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map,
+ bool uref)
{
int refold;
@@ -704,6 +704,16 @@ static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
return map;
}
+struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+{
+ spin_lock_bh(&map_idr_lock);
+ map = __bpf_map_inc_not_zero(map, uref);
+ spin_unlock_bh(&map_idr_lock);
+
+ return map;
+}
+EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
+
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
return -ENOTSUPP;
@@ -1619,6 +1629,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
BPF_F_ANY_ALIGNMENT |
+ BPF_F_TEST_STATE_FREQ |
BPF_F_TEST_RND_HI32))
return -EINVAL;
@@ -1707,20 +1718,26 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (err)
goto free_used_maps;
- err = bpf_prog_new_fd(prog);
- if (err < 0) {
- /* failed to allocate fd.
- * bpf_prog_put() is needed because the above
- * bpf_prog_alloc_id() has published the prog
- * to the userspace and the userspace may
- * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
- */
- bpf_prog_put(prog);
- return err;
- }
-
+ /* Upon success of bpf_prog_alloc_id(), the BPF prog is
+ * effectively publicly exposed. However, retrieving via
+ * bpf_prog_get_fd_by_id() will take another reference,
+ * therefore it cannot be gone underneath us.
+ *
+ * Only for the time /after/ successful bpf_prog_new_fd()
+ * and before returning to userspace, we might just hold
+ * one reference and any parallel close on that fd could
+ * rip everything out. Hence, below notifications must
+ * happen before bpf_prog_new_fd().
+ *
+ * Also, any failure handling from this point onwards must
+ * be using bpf_prog_put() given the program is exposed.
+ */
bpf_prog_kallsyms_add(prog);
perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0);
+
+ err = bpf_prog_new_fd(prog);
+ if (err < 0)
+ bpf_prog_put(prog);
return err;
free_used_maps:
@@ -2177,7 +2194,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
spin_lock_bh(&map_idr_lock);
map = idr_find(&map_idr, id);
if (map)
- map = bpf_map_inc_not_zero(map, true);
+ map = __bpf_map_inc_not_zero(map, true);
else
map = ERR_PTR(-ENOENT);
spin_unlock_bh(&map_idr_lock);
@@ -2874,6 +2891,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
err = bpf_obj_get_next_id(&attr, uattr,
&map_idr, &map_idr_lock);
break;
+ case BPF_BTF_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &btf_idr, &btf_idr_lock);
+ break;
case BPF_PROG_GET_FD_BY_ID:
err = bpf_prog_get_fd_by_id(&attr);
break;
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
new file mode 100644
index 000000000000..7ae5dddd1fe6
--- /dev/null
+++ b/kernel/bpf/sysfs_btf.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Provide kernel BTF information for introspection and use by eBPF tools.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/init.h>
+#include <linux/sysfs.h>
+
+/* See scripts/link-vmlinux.sh, gen_btf() func for details */
+extern char __weak _binary__btf_vmlinux_bin_start[];
+extern char __weak _binary__btf_vmlinux_bin_end[];
+
+static ssize_t
+btf_vmlinux_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t len)
+{
+ memcpy(buf, _binary__btf_vmlinux_bin_start + off, len);
+ return len;
+}
+
+static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
+ .attr = { .name = "vmlinux", .mode = 0444, },
+ .read = btf_vmlinux_read,
+};
+
+static struct kobject *btf_kobj;
+
+static int __init btf_vmlinux_init(void)
+{
+ if (!_binary__btf_vmlinux_bin_start)
+ return 0;
+
+ btf_kobj = kobject_create_and_add("btf", kernel_kobj);
+ if (!btf_kobj)
+ return -ENOMEM;
+
+ bin_attr_btf_vmlinux.size = _binary__btf_vmlinux_bin_end -
+ _binary__btf_vmlinux_bin_start;
+
+ return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux);
+}
+
+subsys_initcall(btf_vmlinux_init);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a2e763703c30..ffc3e53f5300 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -985,9 +985,6 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
reg->smax_value = S64_MAX;
reg->umin_value = 0;
reg->umax_value = U64_MAX;
-
- /* constant backtracking is enabled for root only for now */
- reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
}
/* Mark a register as having a completely unknown (scalar) value. */
@@ -1014,7 +1011,11 @@ static void mark_reg_unknown(struct bpf_verifier_env *env,
__mark_reg_not_init(regs + regno);
return;
}
- __mark_reg_unknown(regs + regno);
+ regs += regno;
+ __mark_reg_unknown(regs);
+ /* constant backtracking is enabled for root without bpf2bpf calls */
+ regs->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ?
+ true : false;
}
static void __mark_reg_not_init(struct bpf_reg_state *reg)
@@ -1519,9 +1520,9 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
return -EFAULT;
}
*stack_mask |= 1ull << spi;
- } else if (class == BPF_STX) {
+ } else if (class == BPF_STX || class == BPF_ST) {
if (*reg_mask & dreg)
- /* stx shouldn't be using _scalar_ dst_reg
+ /* stx & st shouldn't be using _scalar_ dst_reg
* to access memory. It means backtracking
* encountered a case of pointer subtraction.
*/
@@ -1540,7 +1541,8 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (!(*stack_mask & (1ull << spi)))
return 0;
*stack_mask &= ~(1ull << spi);
- *reg_mask |= sreg;
+ if (class == BPF_STX)
+ *reg_mask |= sreg;
} else if (class == BPF_JMP || class == BPF_JMP32) {
if (opcode == BPF_CALL) {
if (insn->src_reg == BPF_PSEUDO_CALL)
@@ -1569,10 +1571,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (mode == BPF_IND || mode == BPF_ABS)
/* to be analyzed */
return -ENOTSUPP;
- } else if (class == BPF_ST) {
- if (*reg_mask & dreg)
- /* likely pointer subtraction */
- return -ENOTSUPP;
}
return 0;
}
@@ -1774,16 +1772,21 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
bitmap_from_u64(mask, stack_mask);
for_each_set_bit(i, mask, 64) {
if (i >= func->allocated_stack / BPF_REG_SIZE) {
- /* This can happen if backtracking
- * is propagating stack precision where
- * caller has larger stack frame
- * than callee, but backtrack_insn() should
- * have returned -ENOTSUPP.
+ /* the sequence of instructions:
+ * 2: (bf) r3 = r10
+ * 3: (7b) *(u64 *)(r3 -8) = r0
+ * 4: (79) r4 = *(u64 *)(r10 -8)
+ * doesn't contain jmps. It's backtracked
+ * as a single block.
+ * During backtracking insn 3 is not recognized as
+ * stack access, so at the end of backtracking
+ * stack slot fp-8 is still marked in stack_mask.
+ * However the parent state may not have accessed
+ * fp-8 and it's "unallocated" stack space.
+ * In such case fallback to conservative.
*/
- verbose(env, "BUG spi %d stack_size %d\n",
- i, func->allocated_stack);
- WARN_ONCE(1, "verifier backtracking bug");
- return -EFAULT;
+ mark_all_scalars_precise(env, st);
+ return 0;
}
if (func->stack[i].slot_type[0] != STACK_SPILL) {
@@ -3460,6 +3463,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
goto error;
break;
case BPF_MAP_TYPE_DEVMAP:
+ case BPF_MAP_TYPE_DEVMAP_HASH:
if (func_id != BPF_FUNC_redirect_map &&
func_id != BPF_FUNC_map_lookup_elem)
goto error;
@@ -3542,6 +3546,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
break;
case BPF_FUNC_redirect_map:
if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+ map->map_type != BPF_MAP_TYPE_DEVMAP_HASH &&
map->map_type != BPF_MAP_TYPE_CPUMAP &&
map->map_type != BPF_MAP_TYPE_XSKMAP)
goto error;
@@ -6106,11 +6111,13 @@ static int check_return_code(struct bpf_verifier_env *env)
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
range = tnum_range(1, 1);
+ break;
case BPF_PROG_TYPE_CGROUP_SKB:
if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
range = tnum_range(0, 3);
enforce_attach_type_range = tnum_range(2, 3);
}
+ break;
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_CGROUP_DEVICE:
@@ -7221,7 +7228,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err, states_cnt = 0;
- bool add_new_state = false;
+ bool add_new_state = env->test_state_freq ? true : false;
cur->last_insn_idx = env->prev_insn_idx;
if (!env->insn_aux_data[insn_idx].prune_point)
@@ -8617,8 +8624,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
if (is_narrower_load && size < target_size) {
- u8 shift = (off & (size_default - 1)) * 8;
-
+ u8 shift = bpf_ctx_narrow_access_offset(
+ off, size, size_default) * 8;
if (ctx_field_size <= 4) {
if (shift)
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
@@ -9261,6 +9268,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->allow_ptr_leaks = is_priv;
+ if (is_priv)
+ env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
+
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 9bb96ace9fa1..942c662e2eed 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -13,8 +13,71 @@ struct xsk_map {
struct bpf_map map;
struct xdp_sock **xsk_map;
struct list_head __percpu *flush_list;
+ spinlock_t lock; /* Synchronize map updates */
};
+int xsk_map_inc(struct xsk_map *map)
+{
+ struct bpf_map *m = &map->map;
+
+ m = bpf_map_inc(m, false);
+ return PTR_ERR_OR_ZERO(m);
+}
+
+void xsk_map_put(struct xsk_map *map)
+{
+ bpf_map_put(&map->map);
+}
+
+static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map,
+ struct xdp_sock **map_entry)
+{
+ struct xsk_map_node *node;
+ int err;
+
+ node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN);
+ if (!node)
+ return NULL;
+
+ err = xsk_map_inc(map);
+ if (err) {
+ kfree(node);
+ return ERR_PTR(err);
+ }
+
+ node->map = map;
+ node->map_entry = map_entry;
+ return node;
+}
+
+static void xsk_map_node_free(struct xsk_map_node *node)
+{
+ xsk_map_put(node->map);
+ kfree(node);
+}
+
+static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node)
+{
+ spin_lock_bh(&xs->map_list_lock);
+ list_add_tail(&node->node, &xs->map_list);
+ spin_unlock_bh(&xs->map_list_lock);
+}
+
+static void xsk_map_sock_delete(struct xdp_sock *xs,
+ struct xdp_sock **map_entry)
+{
+ struct xsk_map_node *n, *tmp;
+
+ spin_lock_bh(&xs->map_list_lock);
+ list_for_each_entry_safe(n, tmp, &xs->map_list, node) {
+ if (map_entry == n->map_entry) {
+ list_del(&n->node);
+ xsk_map_node_free(n);
+ }
+ }
+ spin_unlock_bh(&xs->map_list_lock);
+}
+
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
struct xsk_map *m;
@@ -34,6 +97,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&m->map, attr);
+ spin_lock_init(&m->lock);
cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
cost += sizeof(struct list_head) * num_possible_cpus();
@@ -71,21 +135,9 @@ free_m:
static void xsk_map_free(struct bpf_map *map)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
- int i;
bpf_clear_redirect_map(map);
synchronize_net();
-
- for (i = 0; i < map->max_entries; i++) {
- struct xdp_sock *xs;
-
- xs = m->xsk_map[i];
- if (!xs)
- continue;
-
- sock_put((struct sock *)xs);
- }
-
free_percpu(m->flush_list);
bpf_map_area_free(m->xsk_map);
kfree(m);
@@ -164,8 +216,9 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
+ struct xdp_sock *xs, *old_xs, **map_entry;
u32 i = *(u32 *)key, fd = *(u32 *)value;
- struct xdp_sock *xs, *old_xs;
+ struct xsk_map_node *node;
struct socket *sock;
int err;
@@ -173,8 +226,6 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EINVAL;
if (unlikely(i >= m->map.max_entries))
return -E2BIG;
- if (unlikely(map_flags == BPF_NOEXIST))
- return -EEXIST;
sock = sockfd_lookup(fd, &err);
if (!sock)
@@ -192,32 +243,70 @@ static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EOPNOTSUPP;
}
- sock_hold(sock->sk);
+ map_entry = &m->xsk_map[i];
+ node = xsk_map_node_alloc(m, map_entry);
+ if (IS_ERR(node)) {
+ sockfd_put(sock);
+ return PTR_ERR(node);
+ }
- old_xs = xchg(&m->xsk_map[i], xs);
+ spin_lock_bh(&m->lock);
+ old_xs = READ_ONCE(*map_entry);
+ if (old_xs == xs) {
+ err = 0;
+ goto out;
+ } else if (old_xs && map_flags == BPF_NOEXIST) {
+ err = -EEXIST;
+ goto out;
+ } else if (!old_xs && map_flags == BPF_EXIST) {
+ err = -ENOENT;
+ goto out;
+ }
+ xsk_map_sock_add(xs, node);
+ WRITE_ONCE(*map_entry, xs);
if (old_xs)
- sock_put((struct sock *)old_xs);
-
+ xsk_map_sock_delete(old_xs, map_entry);
+ spin_unlock_bh(&m->lock);
sockfd_put(sock);
return 0;
+
+out:
+ spin_unlock_bh(&m->lock);
+ sockfd_put(sock);
+ xsk_map_node_free(node);
+ return err;
}
static int xsk_map_delete_elem(struct bpf_map *map, void *key)
{
struct xsk_map *m = container_of(map, struct xsk_map, map);
- struct xdp_sock *old_xs;
+ struct xdp_sock *old_xs, **map_entry;
int k = *(u32 *)key;
if (k >= map->max_entries)
return -EINVAL;
- old_xs = xchg(&m->xsk_map[k], NULL);
+ spin_lock_bh(&m->lock);
+ map_entry = &m->xsk_map[k];
+ old_xs = xchg(map_entry, NULL);
if (old_xs)
- sock_put((struct sock *)old_xs);
+ xsk_map_sock_delete(old_xs, map_entry);
+ spin_unlock_bh(&m->lock);
return 0;
}
+void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
+ struct xdp_sock **map_entry)
+{
+ spin_lock_bh(&map->lock);
+ if (READ_ONCE(*map_entry) == xs) {
+ WRITE_ONCE(*map_entry, NULL);
+ xsk_map_sock_delete(xs, map_entry);
+ }
+ spin_unlock_bh(&map->lock);
+}
+
const struct bpf_map_ops xsk_map_ops = {
.map_alloc = xsk_map_alloc,
.map_free = xsk_map_free,
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 88006be40ea3..7f83f4121d8d 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -194,25 +194,6 @@ struct cgroup_pidlist {
};
/*
- * The following two functions "fix" the issue where there are more pids
- * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
- * TODO: replace with a kernel-wide solution to this problem
- */
-#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
-static void *pidlist_allocate(int count)
-{
- if (PIDLIST_TOO_LARGE(count))
- return vmalloc(array_size(count, sizeof(pid_t)));
- else
- return kmalloc_array(count, sizeof(pid_t), GFP_KERNEL);
-}
-
-static void pidlist_free(void *p)
-{
- kvfree(p);
-}
-
-/*
* Used to destroy all pidlists lingering waiting for destroy timer. None
* should be left afterwards.
*/
@@ -244,7 +225,7 @@ static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
*/
if (!delayed_work_pending(dwork)) {
list_del(&l->links);
- pidlist_free(l->list);
+ kvfree(l->list);
put_pid_ns(l->key.ns);
tofree = l;
}
@@ -365,7 +346,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
- array = pidlist_allocate(length);
+ array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL);
if (!array)
return -ENOMEM;
/* now, populate the array */
@@ -390,12 +371,12 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
- pidlist_free(array);
+ kvfree(array);
return -ENOMEM;
}
/* store array, freeing old if necessary */
- pidlist_free(l->list);
+ kvfree(l->list);
l->list = array;
l->length = length;
*lp = l;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 300b0c416341..080561bb8a4b 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -488,7 +488,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
rcu_read_lock();
css = cgroup_css(cgrp, ss);
- if (!css || !css_tryget_online(css))
+ if (css && !css_tryget_online(css))
css = NULL;
rcu_read_unlock();
@@ -1891,7 +1891,7 @@ static int cgroup_reconfigure(struct fs_context *fc)
*/
static bool use_task_css_set_links __read_mostly;
-static void cgroup_enable_task_cg_lists(void)
+void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
@@ -2201,8 +2201,7 @@ static int cgroup_init_fs_context(struct fs_context *fc)
fc->ops = &cgroup_fs_context_ops;
else
fc->ops = &cgroup1_fs_context_ops;
- if (fc->user_ns)
- put_user_ns(fc->user_ns);
+ put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
return 0;
@@ -2243,6 +2242,50 @@ static struct file_system_type cgroup2_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
+#ifdef CONFIG_CPUSETS
+static const struct fs_context_operations cpuset_fs_context_ops = {
+ .get_tree = cgroup1_get_tree,
+ .free = cgroup_fs_context_free,
+};
+
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
+ * users. If someone tries to mount the "cpuset" filesystem, we
+ * silently switch it to mount "cgroup" instead
+ */
+static int cpuset_init_fs_context(struct fs_context *fc)
+{
+ char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
+ struct cgroup_fs_context *ctx;
+ int err;
+
+ err = cgroup_init_fs_context(fc);
+ if (err) {
+ kfree(agent);
+ return err;
+ }
+
+ fc->ops = &cpuset_fs_context_ops;
+
+ ctx = cgroup_fc2context(fc);
+ ctx->subsys_mask = 1 << cpuset_cgrp_id;
+ ctx->flags |= CGRP_ROOT_NOPREFIX;
+ ctx->release_agent = agent;
+
+ get_filesystem(&cgroup_fs_type);
+ put_filesystem(fc->fs_type);
+ fc->fs_type = &cgroup_fs_type;
+
+ return 0;
+}
+
+static struct file_system_type cpuset_fs_type = {
+ .name = "cpuset",
+ .init_fs_context = cpuset_init_fs_context,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+#endif
+
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns)
{
@@ -2851,7 +2894,7 @@ static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
do_each_subsys_mask(ss, ssid, ss_mask) {
if (printed)
seq_putc(seq, ' ');
- seq_printf(seq, "%s", ss->name);
+ seq_puts(seq, ss->name);
printed = true;
} while_each_subsys_mask();
if (printed)
@@ -5212,8 +5255,16 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
* if the parent has to be frozen, the child has too.
*/
cgrp->freezer.e_freeze = parent->freezer.e_freeze;
- if (cgrp->freezer.e_freeze)
+ if (cgrp->freezer.e_freeze) {
+ /*
+ * Set the CGRP_FREEZE flag, so when a process will be
+ * attached to the child cgroup, it will become frozen.
+ * At this point the new cgroup is unpopulated, so we can
+ * consider it frozen immediately.
+ */
+ set_bit(CGRP_FREEZE, &cgrp->flags);
set_bit(CGRP_FROZEN, &cgrp->flags);
+ }
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
@@ -5761,6 +5812,9 @@ int __init cgroup_init(void)
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
+#ifdef CONFIG_CPUSETS
+ WARN_ON(register_filesystem(&cpuset_fs_type));
+#endif
return 0;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b3b02b9c4405..c52bc91f882b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -45,6 +45,7 @@
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/seq_file.h>
@@ -332,7 +333,18 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
-static DEFINE_MUTEX(cpuset_mutex);
+DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
+
+void cpuset_read_lock(void)
+{
+ percpu_down_read(&cpuset_rwsem);
+}
+
+void cpuset_read_unlock(void)
+{
+ percpu_up_read(&cpuset_rwsem);
+}
+
static DEFINE_SPINLOCK(callback_lock);
static struct workqueue_struct *cpuset_migrate_mm_wq;
@@ -356,59 +368,6 @@ static inline bool is_in_v2_mode(void)
}
/*
- * This is ugly, but preserves the userspace API for existing cpuset
- * users. If someone tries to mount the "cpuset" filesystem, we
- * silently switch it to mount "cgroup" instead
- */
-static int cpuset_get_tree(struct fs_context *fc)
-{
- struct file_system_type *cgroup_fs;
- struct fs_context *new_fc;
- int ret;
-
- cgroup_fs = get_fs_type("cgroup");
- if (!cgroup_fs)
- return -ENODEV;
-
- new_fc = fs_context_for_mount(cgroup_fs, fc->sb_flags);
- if (IS_ERR(new_fc)) {
- ret = PTR_ERR(new_fc);
- } else {
- static const char agent_path[] = "/sbin/cpuset_release_agent";
- ret = vfs_parse_fs_string(new_fc, "cpuset", NULL, 0);
- if (!ret)
- ret = vfs_parse_fs_string(new_fc, "noprefix", NULL, 0);
- if (!ret)
- ret = vfs_parse_fs_string(new_fc, "release_agent",
- agent_path, sizeof(agent_path) - 1);
- if (!ret)
- ret = vfs_get_tree(new_fc);
- if (!ret) { /* steal the result */
- fc->root = new_fc->root;
- new_fc->root = NULL;
- }
- put_fs_context(new_fc);
- }
- put_filesystem(cgroup_fs);
- return ret;
-}
-
-static const struct fs_context_operations cpuset_fs_context_ops = {
- .get_tree = cpuset_get_tree,
-};
-
-static int cpuset_init_fs_context(struct fs_context *fc)
-{
- fc->ops = &cpuset_fs_context_ops;
- return 0;
-}
-
-static struct file_system_type cpuset_fs_type = {
- .name = "cpuset",
- .init_fs_context = cpuset_init_fs_context,
-};
-
-/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
* until we find one that does have some online cpus.
@@ -729,7 +688,7 @@ static inline int nr_cpusets(void)
* load balancing domains (sched domains) as specified by that partial
* partition.
*
- * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst
+ * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
* for a background explanation of this.
*
* Does not return errors, on the theory that the callers of this
@@ -947,6 +906,67 @@ done:
return ndoms;
}
+static void update_tasks_root_domain(struct cpuset *cs)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&cs->css, 0, &it);
+
+ while ((task = css_task_iter_next(&it)))
+ dl_add_task_root_domain(task);
+
+ css_task_iter_end(&it);
+}
+
+static void rebuild_root_domains(void)
+{
+ struct cpuset *cs = NULL;
+ struct cgroup_subsys_state *pos_css;
+
+ percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_cpus_held();
+ lockdep_assert_held(&sched_domains_mutex);
+
+ cgroup_enable_task_cg_lists();
+
+ rcu_read_lock();
+
+ /*
+ * Clear default root domain DL accounting, it will be computed again
+ * if a task belongs to it.
+ */
+ dl_clear_root_domain(&def_root_domain);
+
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+
+ if (cpumask_empty(cs->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+
+ css_get(&cs->css);
+
+ rcu_read_unlock();
+
+ update_tasks_root_domain(cs);
+
+ rcu_read_lock();
+ css_put(&cs->css);
+ }
+ rcu_read_unlock();
+}
+
+static void
+partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ mutex_lock(&sched_domains_mutex);
+ partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
+ rebuild_root_domains();
+ mutex_unlock(&sched_domains_mutex);
+}
+
/*
* Rebuild scheduler domains.
*
@@ -964,8 +984,8 @@ static void rebuild_sched_domains_locked(void)
cpumask_var_t *doms;
int ndoms;
- lockdep_assert_held(&cpuset_mutex);
- get_online_cpus();
+ lockdep_assert_cpus_held();
+ percpu_rwsem_assert_held(&cpuset_rwsem);
/*
* We have raced with CPU hotplug. Don't do anything to avoid
@@ -974,19 +994,17 @@ static void rebuild_sched_domains_locked(void)
*/
if (!top_cpuset.nr_subparts_cpus &&
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
- goto out;
+ return;
if (top_cpuset.nr_subparts_cpus &&
!cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
- goto out;
+ return;
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
- partition_sched_domains(ndoms, doms, attr);
-out:
- put_online_cpus();
+ partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
@@ -996,9 +1014,11 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void)
{
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
rebuild_sched_domains_locked();
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
}
/**
@@ -1104,7 +1124,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
bool part_error = false; /* Partition error? */
- lockdep_assert_held(&cpuset_mutex);
+ percpu_rwsem_assert_held(&cpuset_rwsem);
/*
* The parent must be a partition root.
@@ -2092,7 +2112,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
cs = css_cs(css);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
@@ -2116,7 +2136,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
cs->attach_in_progress++;
ret = 0;
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
return ret;
}
@@ -2126,9 +2146,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
css_cs(css)->attach_in_progress--;
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
}
/*
@@ -2151,7 +2171,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
/* prepare for attach */
if (cs == &top_cpuset)
@@ -2205,7 +2225,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
}
/* The various types of files and directories in a cpuset file system */
@@ -2236,7 +2256,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
@@ -2272,7 +2293,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
return retval;
}
@@ -2283,7 +2305,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2296,7 +2319,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
return retval;
}
@@ -2335,7 +2359,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2359,7 +2384,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_cpuset(trialcs);
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
@@ -2490,13 +2516,15 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
return -EINVAL;
css_get(&cs->css);
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
retval = update_prstate(cs, val);
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
css_put(&cs->css);
return retval ?: nbytes;
}
@@ -2702,7 +2730,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
set_bit(CS_ONLINE, &cs->flags);
if (is_spread_page(parent))
@@ -2753,7 +2782,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
return 0;
}
@@ -2772,7 +2802,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- mutex_lock(&cpuset_mutex);
+ get_online_cpus();
+ percpu_down_write(&cpuset_rwsem);
if (is_partition_root(cs))
update_prstate(cs, 0);
@@ -2791,7 +2822,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
+ put_online_cpus();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2803,7 +2835,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
@@ -2816,7 +2848,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
}
spin_unlock_irq(&callback_lock);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
}
/*
@@ -2853,12 +2885,12 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
/**
* cpuset_init - initialize cpusets at system boot
*
- * Description: Initialize top_cpuset and the cpuset internal file system,
+ * Description: Initialize top_cpuset
**/
int __init cpuset_init(void)
{
- int err = 0;
+ BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
@@ -2873,10 +2905,6 @@ int __init cpuset_init(void)
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
top_cpuset.relax_domain_level = -1;
- err = register_filesystem(&cpuset_fs_type);
- if (err < 0)
- return err;
-
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
return 0;
@@ -2935,7 +2963,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
/*
* Move tasks to the nearest ancestor with execution resources,
@@ -2945,7 +2973,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
if (is_empty)
remove_tasks_in_empty_cpuset(cs);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
}
static void
@@ -2995,14 +3023,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
/*
* We have raced with task attaching. We wait until attaching
* is finished, so we won't attach a task to an empty cpuset.
*/
if (cs->attach_in_progress) {
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
goto retry;
}
@@ -3070,7 +3098,7 @@ update_tasks:
hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
}
/**
@@ -3100,7 +3128,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
if (on_dfl && !alloc_cpumasks(NULL, &tmp))
ptmp = &tmp;
- mutex_lock(&cpuset_mutex);
+ percpu_down_write(&cpuset_rwsem);
/* fetch the available cpus/mems and find out which changed how */
cpumask_copy(&new_cpus, cpu_active_mask);
@@ -3150,7 +3178,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
update_tasks_nodemask(&top_cpuset);
}
- mutex_unlock(&cpuset_mutex);
+ percpu_up_write(&cpuset_rwsem);
/* if cpus or mems changed, we need to propagate to descendants */
if (cpus_updated || mems_updated) {
diff --git a/kernel/configs.c b/kernel/configs.c
index b062425ccf8d..c09ea4c995e1 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* kernel/configs.c
* Echo the kernel .config file used to build the kernel
@@ -6,21 +7,6 @@
* Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
* Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
* Copyright (C) 2002 Hewlett-Packard Company
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/kernel.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e84c0873559e..e1967e9eddc2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -62,7 +62,6 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
- bool booted_once;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
@@ -76,6 +75,10 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
.fail = CPUHP_INVALID,
};
+#ifdef CONFIG_SMP
+cpumask_t cpus_booted_once_mask;
+#endif
+
#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
static struct lockdep_map cpuhp_state_up_map =
STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
@@ -433,7 +436,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu)
* CPU. Otherwise, a broadacasted MCE observing CR4.MCE=0b on any
* core will shutdown the machine.
*/
- return !per_cpu(cpuhp_state, cpu).booted_once;
+ return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
}
#else
static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
@@ -1066,7 +1069,7 @@ void notify_cpu_starting(unsigned int cpu)
int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
- st->booted_once = true;
+ cpumask_set_cpu(cpu, &cpus_booted_once_mask);
while (st->state < target) {
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
@@ -2295,6 +2298,9 @@ EXPORT_SYMBOL(__cpu_present_mask);
struct cpumask __cpu_active_mask __read_mostly;
EXPORT_SYMBOL(__cpu_active_mask);
+atomic_t __num_online_cpus __read_mostly;
+EXPORT_SYMBOL(__num_online_cpus);
+
void init_cpu_present(const struct cpumask *src)
{
cpumask_copy(&__cpu_present_mask, src);
@@ -2310,6 +2316,27 @@ void init_cpu_online(const struct cpumask *src)
cpumask_copy(&__cpu_online_mask, src);
}
+void set_cpu_online(unsigned int cpu, bool online)
+{
+ /*
+ * atomic_inc/dec() is required to handle the horrid abuse of this
+ * function by the reboot and kexec code which invoke it from
+ * IPI/NMI broadcasts when shutting down CPUs. Invocation from
+ * regular CPU hotplug is properly serialized.
+ *
+ * Note, that the fact that __num_online_cpus is of type atomic_t
+ * does not protect readers which are not serialized against
+ * concurrent hotplug operations.
+ */
+ if (online) {
+ if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
+ atomic_inc(&__num_online_cpus);
+ } else {
+ if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
+ atomic_dec(&__num_online_cpus);
+ }
+}
+
/*
* Activate the first processor.
*/
@@ -2334,7 +2361,7 @@ void __init boot_cpu_init(void)
void __init boot_cpu_hotplug_init(void)
{
#ifdef CONFIG_SMP
- this_cpu_write(cpuhp_state.booted_once, true);
+ cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
}
diff --git a/kernel/cred.c b/kernel/cred.c
index f9a0ce66c9c3..c0a4c12d38b2 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -144,7 +144,10 @@ void __put_cred(struct cred *cred)
BUG_ON(cred == current->cred);
BUG_ON(cred == current->real_cred);
- call_rcu(&cred->rcu, put_cred_rcu);
+ if (cred->non_rcu)
+ put_cred_rcu(&cred->rcu);
+ else
+ call_rcu(&cred->rcu, put_cred_rcu);
}
EXPORT_SYMBOL(__put_cred);
@@ -261,6 +264,7 @@ struct cred *prepare_creds(void)
old = task->cred;
memcpy(new, old, sizeof(struct cred));
+ new->non_rcu = 0;
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_group_info(new->group_info);
@@ -544,7 +548,19 @@ const struct cred *override_creds(const struct cred *new)
validate_creds(old);
validate_creds(new);
- get_cred(new);
+
+ /*
+ * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
+ *
+ * That means that we do not clear the 'non_rcu' flag, since
+ * we are only installing the cred into the thread-synchronous
+ * '->cred' pointer, not the '->real_cred' pointer that is
+ * visible to other threads under RCU.
+ *
+ * Also note that we did validate_creds() manually, not depending
+ * on the validation in 'get_cred()'.
+ */
+ get_new_cred((struct cred *)new);
alter_cred_subscribers(new, 1);
rcu_assign_pointer(current->cred, new);
alter_cred_subscribers(old, -1);
@@ -681,6 +697,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
validate_creds(old);
*new = *old;
+ new->non_rcu = 0;
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_uid(new->user);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 5cc608de6883..10f1187b3907 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -787,11 +787,8 @@ out:
}
/*
- * GDB places a breakpoint at this function to know dynamically
- * loaded objects. It's not defined static so that only one instance with this
- * name exists in the kernel.
+ * GDB places a breakpoint at this function to know dynamically loaded objects.
*/
-
static int module_event(struct notifier_block *self, unsigned long val,
void *data)
{
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 9ecfa37c7fbf..4567fe998c30 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -830,7 +830,7 @@ static void parse_grep(const char *str)
cp++;
while (isspace(*cp))
cp++;
- if (strncmp(cp, "grep ", 5)) {
+ if (!str_has_prefix(cp, "grep ")) {
kdb_printf("invalid 'pipe', see grephelp\n");
return;
}
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 70f8f8d9200e..73c5c2b8e824 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -20,6 +20,15 @@ config ARCH_HAS_DMA_COHERENCE_H
config ARCH_HAS_DMA_SET_MASK
bool
+#
+# Select this option if the architecture needs special handling for
+# DMA_ATTR_WRITE_COMBINE. Normally the "uncached" mapping should be what
+# people thing of when saying write combine, so very few platforms should
+# need to enable this.
+#
+config ARCH_HAS_DMA_WRITE_COMBINE
+ bool
+
config DMA_DECLARE_COHERENT
bool
@@ -45,7 +54,7 @@ config ARCH_HAS_DMA_PREP_COHERENT
config ARCH_HAS_DMA_COHERENT_TO_PFN
bool
-config ARCH_HAS_DMA_MMAP_PGPROT
+config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool
config DMA_NONCOHERENT_CACHE_SYNC
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 29fd6590dc1e..545e3869b0e3 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -122,18 +122,6 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
dma_release_coherent_memory(mem);
return ret;
}
-EXPORT_SYMBOL(dma_declare_coherent_memory);
-
-void dma_release_declared_memory(struct device *dev)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
-
- if (!mem)
- return;
- dma_release_coherent_memory(mem);
- dev->dma_mem = NULL;
-}
-EXPORT_SYMBOL(dma_release_declared_memory);
static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
ssize_t size, dma_addr_t *dma_handle)
@@ -288,7 +276,6 @@ int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret);
}
-EXPORT_SYMBOL(dma_mmap_from_dev_coherent);
int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr,
size_t size, int *ret)
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index b2a87905846d..69cfb4345388 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -214,6 +214,60 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages,
return cma_release(dev_get_cma_area(dev), pages, count);
}
+/**
+ * dma_alloc_contiguous() - allocate contiguous pages
+ * @dev: Pointer to device for which the allocation is performed.
+ * @size: Requested allocation size.
+ * @gfp: Allocation flags.
+ *
+ * This function allocates contiguous memory buffer for specified device. It
+ * first tries to use device specific contiguous memory area if available or
+ * the default global one, then tries a fallback allocation of normal pages.
+ *
+ * Note that it byapss one-page size of allocations from the global area as
+ * the addresses within one page are always contiguous, so there is no need
+ * to waste CMA pages for that kind; it also helps reduce fragmentations.
+ */
+struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
+{
+ size_t count = size >> PAGE_SHIFT;
+ struct page *page = NULL;
+ struct cma *cma = NULL;
+
+ if (dev && dev->cma_area)
+ cma = dev->cma_area;
+ else if (count > 1)
+ cma = dma_contiguous_default_area;
+
+ /* CMA can be used only in the context which permits sleeping */
+ if (cma && gfpflags_allow_blocking(gfp)) {
+ size_t align = get_order(size);
+ size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
+
+ page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
+ }
+
+ return page;
+}
+
+/**
+ * dma_free_contiguous() - release allocated pages
+ * @dev: Pointer to device for which the pages were allocated.
+ * @page: Pointer to the allocated pages.
+ * @size: Size of allocated pages.
+ *
+ * This function releases memory allocated by dma_alloc_contiguous(). As the
+ * cma_release returns false when provided pages do not belong to contiguous
+ * area and true otherwise, this function then does a fallback __free_pages()
+ * upon a false-return.
+ */
+void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
+{
+ if (!cma_release(dev_get_cma_area(dev), page,
+ PAGE_ALIGN(size) >> PAGE_SHIFT))
+ __free_pages(page, get_order(size));
+}
+
/*
* Support for reserved memory regions defined in device tree
*/
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 2c2772e9702a..8402b29c280f 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -23,14 +23,6 @@
#define ARCH_ZONE_DMA_BITS 24
#endif
-/*
- * For AMD SEV all DMA must be to unencrypted addresses.
- */
-static inline bool force_dma_unencrypted(void)
-{
- return sev_active();
-}
-
static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
{
if (!dev->dma_mask) {
@@ -46,7 +38,7 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
static inline dma_addr_t phys_to_dma_direct(struct device *dev,
phys_addr_t phys)
{
- if (force_dma_unencrypted())
+ if (force_dma_unencrypted(dev))
return __phys_to_dma(dev, phys);
return phys_to_dma(dev, phys);
}
@@ -55,9 +47,6 @@ u64 dma_direct_get_required_mask(struct device *dev)
{
u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
- if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma)
- max_dma = dev->bus_dma_mask;
-
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
}
@@ -67,7 +56,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask)
dma_mask = dev->bus_dma_mask;
- if (force_dma_unencrypted())
+ if (force_dma_unencrypted(dev))
*phys_mask = __dma_to_phys(dev, dma_mask);
else
*phys_mask = dma_to_phys(dev, dma_mask);
@@ -96,8 +85,8 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
- unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
- int page_order = get_order(size);
+ size_t alloc_size = PAGE_ALIGN(size);
+ int node = dev_to_node(dev);
struct page *page = NULL;
u64 phys_mask;
@@ -108,21 +97,16 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp &= ~__GFP_ZERO;
gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_mask);
-again:
- /* CMA can be used only in the context which permits sleeping */
- if (gfpflags_allow_blocking(gfp)) {
- page = dma_alloc_from_contiguous(dev, count, page_order,
- gfp & __GFP_NOWARN);
- if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- dma_release_from_contiguous(dev, page, count);
- page = NULL;
- }
+ page = dma_alloc_contiguous(dev, alloc_size, gfp);
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ dma_free_contiguous(dev, page, alloc_size);
+ page = NULL;
}
+again:
if (!page)
- page = alloc_pages_node(dev_to_node(dev), gfp, page_order);
-
+ page = alloc_pages_node(node, gfp, get_order(alloc_size));
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- __free_pages(page, page_order);
+ dma_free_contiguous(dev, page, size);
page = NULL;
if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
@@ -151,10 +135,20 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
if (!page)
return NULL;
+ if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
+ !force_dma_unencrypted(dev)) {
+ /* remove any dirty cache lines on the kernel alias */
+ if (!PageHighMem(page))
+ arch_dma_prep_coherent(page, size);
+ *dma_handle = phys_to_dma(dev, page_to_phys(page));
+ /* return the page pointer as the opaque cookie */
+ return page;
+ }
+
if (PageHighMem(page)) {
/*
* Depending on the cma= arguments and per-arch setup
- * dma_alloc_from_contiguous could return highmem pages.
+ * dma_alloc_contiguous could return highmem pages.
* Without remapping there is no way to return them here,
* so log an error and fail.
*/
@@ -164,22 +158,26 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
}
ret = page_address(page);
- if (force_dma_unencrypted()) {
+ if (force_dma_unencrypted(dev)) {
set_memory_decrypted((unsigned long)ret, 1 << get_order(size));
*dma_handle = __phys_to_dma(dev, page_to_phys(page));
} else {
*dma_handle = phys_to_dma(dev, page_to_phys(page));
}
memset(ret, 0, size);
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+ dma_alloc_need_uncached(dev, attrs)) {
+ arch_dma_prep_coherent(page, size);
+ ret = uncached_kernel_address(ret);
+ }
+
return ret;
}
void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page)
{
- unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
- if (!dma_release_from_contiguous(dev, page, count))
- __free_pages(page, get_order(size));
+ dma_free_contiguous(dev, page, size);
}
void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
@@ -187,15 +185,27 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
{
unsigned int page_order = get_order(size);
- if (force_dma_unencrypted())
+ if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
+ !force_dma_unencrypted(dev)) {
+ /* cpu_addr is a struct page cookie, not a kernel address */
+ __dma_direct_free_pages(dev, size, cpu_addr);
+ return;
+ }
+
+ if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+ dma_alloc_need_uncached(dev, attrs))
+ cpu_addr = cached_kernel_address(cpu_addr);
__dma_direct_free_pages(dev, size, virt_to_page(cpu_addr));
}
void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
- if (!dev_is_dma_coherent(dev))
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+ dma_alloc_need_uncached(dev, attrs))
return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs);
}
@@ -203,7 +213,8 @@ void *dma_direct_alloc(struct device *dev, size_t size,
void dma_direct_free(struct device *dev, size_t size,
void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
{
- if (!dev_is_dma_coherent(dev))
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) &&
+ dma_alloc_need_uncached(dev, attrs))
arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
else
dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs);
@@ -231,12 +242,14 @@ void dma_direct_sync_sg_for_device(struct device *dev,
int i;
for_each_sg(sgl, sg, nents, i) {
- if (unlikely(is_swiotlb_buffer(sg_phys(sg))))
- swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length,
+ phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
+
+ if (unlikely(is_swiotlb_buffer(paddr)))
+ swiotlb_tbl_sync_single(dev, paddr, sg->length,
dir, SYNC_FOR_DEVICE);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(dev, sg_phys(sg), sg->length,
+ arch_sync_dma_for_device(dev, paddr, sg->length,
dir);
}
}
@@ -268,11 +281,13 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
int i;
for_each_sg(sgl, sg, nents, i) {
+ phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
+
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir);
-
- if (unlikely(is_swiotlb_buffer(sg_phys(sg))))
- swiotlb_tbl_sync_single(dev, sg_phys(sg), sg->length, dir,
+ arch_sync_dma_for_cpu(dev, paddr, sg->length, dir);
+
+ if (unlikely(is_swiotlb_buffer(paddr)))
+ swiotlb_tbl_sync_single(dev, paddr, sg->length, dir,
SYNC_FOR_CPU);
}
@@ -290,7 +305,7 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
if (unlikely(is_swiotlb_buffer(phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
+ swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
}
EXPORT_SYMBOL(dma_direct_unmap_page);
@@ -396,11 +411,9 @@ int dma_direct_supported(struct device *dev, u64 mask)
size_t dma_direct_max_mapping_size(struct device *dev)
{
- size_t size = SIZE_MAX;
-
/* If SWIOTLB is active, use its maximum mapping size */
- if (is_swiotlb_active())
- size = swiotlb_max_mapping_size(dev);
-
- return size;
+ if (is_swiotlb_active() &&
+ (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
+ return swiotlb_max_mapping_size(dev);
+ return SIZE_MAX;
}
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index f7afdadb6770..d9334f31a5af 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -116,11 +116,16 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
int ret;
if (!dev_is_dma_coherent(dev)) {
+ unsigned long pfn;
+
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN))
return -ENXIO;
- page = pfn_to_page(arch_dma_coherent_to_pfn(dev, cpu_addr,
- dma_addr));
+ /* If the PFN is not valid, we do not have a struct page */
+ pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr);
+ if (!pfn_valid(pfn))
+ return -ENXIO;
+ page = pfn_to_page(pfn);
} else {
page = virt_to_page(cpu_addr);
}
@@ -131,20 +136,51 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
return ret;
}
+/*
+ * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
+ * that the intention is to allow exporting memory allocated via the
+ * coherent DMA APIs through the dma_buf API, which only accepts a
+ * scattertable. This presents a couple of problems:
+ * 1. Not all memory allocated via the coherent DMA APIs is backed by
+ * a struct page
+ * 2. Passing coherent DMA memory into the streaming APIs is not allowed
+ * as we will try to flush the memory through a different alias to that
+ * actually being used (and the flushes are redundant.)
+ */
int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (!dma_is_direct(ops) && ops->get_sgtable)
- return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
- attrs);
- return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size,
- attrs);
+ if (dma_is_direct(ops))
+ return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr,
+ size, attrs);
+ if (!ops->get_sgtable)
+ return -ENXIO;
+ return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs);
}
EXPORT_SYMBOL(dma_get_sgtable_attrs);
+#ifdef CONFIG_MMU
+/*
+ * Return the page attributes used for mapping dma_alloc_* memory, either in
+ * kernel space if remapping is needed, or to userspace through dma_mmap_*.
+ */
+pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
+{
+ if (dev_is_dma_coherent(dev) ||
+ (IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) &&
+ (attrs & DMA_ATTR_NON_CONSISTENT)))
+ return prot;
+#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
+ if (attrs & DMA_ATTR_WRITE_COMBINE)
+ return pgprot_writecombine(prot);
+#endif
+ return pgprot_dmacoherent(prot);
+}
+#endif /* CONFIG_MMU */
+
/*
* Create userspace mapping for the DMA-coherent memory.
*/
@@ -152,14 +188,14 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
-#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP
+#ifdef CONFIG_MMU
unsigned long user_count = vma_pages(vma);
unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long off = vma->vm_pgoff;
unsigned long pfn;
int ret = -ENXIO;
- vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs);
+ vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;
@@ -170,7 +206,11 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
if (!dev_is_dma_coherent(dev)) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN))
return -ENXIO;
+
+ /* If the PFN is not valid, we do not have a struct page */
pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr);
+ if (!pfn_valid(pfn))
+ return -ENXIO;
} else {
pfn = page_to_pfn(virt_to_page(cpu_addr));
}
@@ -179,8 +219,29 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
user_count << PAGE_SHIFT, vma->vm_page_prot);
#else
return -ENXIO;
-#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */
+#endif /* CONFIG_MMU */
+}
+
+/**
+ * dma_can_mmap - check if a given device supports dma_mmap_*
+ * @dev: device to check
+ *
+ * Returns %true if @dev supports dma_mmap_coherent() and dma_mmap_attrs() to
+ * map DMA allocations to userspace.
+ */
+bool dma_can_mmap(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (dma_is_direct(ops)) {
+ return IS_ENABLED(CONFIG_MMU) &&
+ (dev_is_dma_coherent(dev) ||
+ IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN));
+ }
+
+ return ops->mmap != NULL;
}
+EXPORT_SYMBOL_GPL(dma_can_mmap);
/**
* dma_mmap_attrs - map a coherent DMA allocation into user space
@@ -201,31 +262,15 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma,
{
const struct dma_map_ops *ops = get_dma_ops(dev);
- if (!dma_is_direct(ops) && ops->mmap)
- return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
- return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+ if (dma_is_direct(ops))
+ return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size,
+ attrs);
+ if (!ops->mmap)
+ return -ENXIO;
+ return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
}
EXPORT_SYMBOL(dma_mmap_attrs);
-static u64 dma_default_get_required_mask(struct device *dev)
-{
- u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT);
- u32 high_totalram = ((max_pfn - 1) >> (32 - PAGE_SHIFT));
- u64 mask;
-
- if (!high_totalram) {
- /* convert to mask just covering totalram */
- low_totalram = (1 << (fls(low_totalram) - 1));
- low_totalram += low_totalram - 1;
- mask = low_totalram;
- } else {
- high_totalram = (1 << (fls(high_totalram) - 1));
- high_totalram += high_totalram - 1;
- mask = (((u64)high_totalram) << 32) + 0xffffffff;
- }
- return mask;
-}
-
u64 dma_get_required_mask(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -234,7 +279,16 @@ u64 dma_get_required_mask(struct device *dev)
return dma_direct_get_required_mask(dev);
if (ops->get_required_mask)
return ops->get_required_mask(dev);
- return dma_default_get_required_mask(dev);
+
+ /*
+ * We require every DMA ops implementation to at least support a 32-bit
+ * DMA mask (and use bounce buffering if that isn't supported in
+ * hardware). As the direct mapping code has its own routine to
+ * actually report an optimal mask we default to 32-bit here as that
+ * is the right thing for most IOMMUs, and at least not actively
+ * harmful in general.
+ */
+ return DMA_BIT_MASK(32);
}
EXPORT_SYMBOL_GPL(dma_get_required_mask);
@@ -291,12 +345,6 @@ void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
}
EXPORT_SYMBOL(dma_free_attrs);
-static inline void dma_check_mask(struct device *dev, u64 mask)
-{
- if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1)))
- dev_warn(dev, "SME is active, device will require DMA bounce buffers\n");
-}
-
int dma_supported(struct device *dev, u64 mask)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -317,11 +365,16 @@ void arch_dma_set_mask(struct device *dev, u64 mask);
int dma_set_mask(struct device *dev, u64 mask)
{
+ /*
+ * Truncate the mask to the actually supported dma_addr_t width to
+ * avoid generating unsupportable addresses.
+ */
+ mask = (dma_addr_t)mask;
+
if (!dev->dma_mask || !dma_supported(dev, mask))
return -EIO;
arch_dma_set_mask(dev, mask);
- dma_check_mask(dev, mask);
*dev->dma_mask = mask;
return 0;
}
@@ -330,10 +383,15 @@ EXPORT_SYMBOL(dma_set_mask);
#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
+ /*
+ * Truncate the mask to the actually supported dma_addr_t width to
+ * avoid generating unsupportable addresses.
+ */
+ mask = (dma_addr_t)mask;
+
if (!dma_supported(dev, mask))
return -EIO;
- dma_check_mask(dev, mask);
dev->coherent_dma_mask = mask;
return 0;
}
@@ -367,3 +425,14 @@ size_t dma_max_mapping_size(struct device *dev)
return size;
}
EXPORT_SYMBOL_GPL(dma_max_mapping_size);
+
+unsigned long dma_get_merge_boundary(struct device *dev)
+{
+ const struct dma_map_ops *ops = get_dma_ops(dev);
+
+ if (!ops || !ops->get_merge_boundary)
+ return 0; /* can't merge */
+
+ return ops->get_merge_boundary(dev);
+}
+EXPORT_SYMBOL_GPL(dma_get_merge_boundary);
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c
index 7a723194ecbe..ca4e5d44b571 100644
--- a/kernel/dma/remap.c
+++ b/kernel/dma/remap.c
@@ -11,13 +11,21 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
+struct page **dma_common_find_pages(void *cpu_addr)
+{
+ struct vm_struct *area = find_vm_area(cpu_addr);
+
+ if (!area || area->flags != VM_DMA_COHERENT)
+ return NULL;
+ return area->pages;
+}
+
static struct vm_struct *__dma_common_pages_remap(struct page **pages,
- size_t size, unsigned long vm_flags, pgprot_t prot,
- const void *caller)
+ size_t size, pgprot_t prot, const void *caller)
{
struct vm_struct *area;
- area = get_vm_area_caller(size, vm_flags, caller);
+ area = get_vm_area_caller(size, VM_DMA_COHERENT, caller);
if (!area)
return NULL;
@@ -34,12 +42,11 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages,
* Cannot be used in non-sleeping contexts
*/
void *dma_common_pages_remap(struct page **pages, size_t size,
- unsigned long vm_flags, pgprot_t prot,
- const void *caller)
+ pgprot_t prot, const void *caller)
{
struct vm_struct *area;
- area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+ area = __dma_common_pages_remap(pages, size, prot, caller);
if (!area)
return NULL;
@@ -53,7 +60,6 @@ void *dma_common_pages_remap(struct page **pages, size_t size,
* Cannot be used in non-sleeping contexts
*/
void *dma_common_contiguous_remap(struct page *page, size_t size,
- unsigned long vm_flags,
pgprot_t prot, const void *caller)
{
int i;
@@ -67,7 +73,7 @@ void *dma_common_contiguous_remap(struct page *page, size_t size,
for (i = 0; i < (size >> PAGE_SHIFT); i++)
pages[i] = nth_page(page, i);
- area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller);
+ area = __dma_common_pages_remap(pages, size, prot, caller);
kfree(pages);
@@ -79,11 +85,11 @@ void *dma_common_contiguous_remap(struct page *page, size_t size,
/*
* Unmaps a range previously mapped by dma_common_*_remap
*/
-void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags)
+void dma_common_free_remap(void *cpu_addr, size_t size)
{
- struct vm_struct *area = find_vm_area(cpu_addr);
+ struct page **pages = dma_common_find_pages(cpu_addr);
- if (!area || (area->flags & vm_flags) != vm_flags) {
+ if (!pages) {
WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
return;
}
@@ -105,7 +111,16 @@ static int __init early_coherent_pool(char *p)
}
early_param("coherent_pool", early_coherent_pool);
-int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
+static gfp_t dma_atomic_pool_gfp(void)
+{
+ if (IS_ENABLED(CONFIG_ZONE_DMA))
+ return GFP_DMA;
+ if (IS_ENABLED(CONFIG_ZONE_DMA32))
+ return GFP_DMA32;
+ return GFP_KERNEL;
+}
+
+static int __init dma_atomic_pool_init(void)
{
unsigned int pool_size_order = get_order(atomic_pool_size);
unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
@@ -117,7 +132,7 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
page = dma_alloc_from_contiguous(NULL, nr_pages,
pool_size_order, false);
else
- page = alloc_pages(gfp, pool_size_order);
+ page = alloc_pages(dma_atomic_pool_gfp(), pool_size_order);
if (!page)
goto out;
@@ -127,8 +142,9 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
if (!atomic_pool)
goto free_page;
- addr = dma_common_contiguous_remap(page, atomic_pool_size, VM_USERMAP,
- prot, __builtin_return_address(0));
+ addr = dma_common_contiguous_remap(page, atomic_pool_size,
+ pgprot_dmacoherent(PAGE_KERNEL),
+ __builtin_return_address(0));
if (!addr)
goto destroy_genpool;
@@ -143,7 +159,7 @@ int __init dma_atomic_pool_init(gfp_t gfp, pgprot_t prot)
return 0;
remove_mapping:
- dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP);
+ dma_common_free_remap(addr, atomic_pool_size);
destroy_genpool:
gen_pool_destroy(atomic_pool);
atomic_pool = NULL;
@@ -155,9 +171,13 @@ out:
atomic_pool_size / 1024);
return -ENOMEM;
}
+postcore_initcall(dma_atomic_pool_init);
bool dma_in_atomic_pool(void *start, size_t size)
{
+ if (unlikely(!atomic_pool))
+ return false;
+
return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
}
@@ -199,8 +219,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
size = PAGE_ALIGN(size);
- if (!gfpflags_allow_blocking(flags) &&
- !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) {
+ if (!gfpflags_allow_blocking(flags)) {
ret = dma_alloc_from_pool(size, &page, flags);
if (!ret)
return NULL;
@@ -214,14 +233,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
- if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
- ret = page; /* opaque cookie */
- goto done;
- }
-
/* create a coherent mapping */
- ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
- arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
+ ret = dma_common_contiguous_remap(page, size,
+ dma_pgprot(dev, PAGE_KERNEL, attrs),
__builtin_return_address(0));
if (!ret) {
__dma_direct_free_pages(dev, size, page);
@@ -237,10 +251,7 @@ done:
void arch_dma_free(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_handle, unsigned long attrs)
{
- if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
- /* vaddr is a struct page cookie, not a kernel address */
- __dma_direct_free_pages(dev, size, vaddr);
- } else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
+ if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) {
phys_addr_t phys = dma_to_phys(dev, dma_handle);
struct page *page = pfn_to_page(__phys_to_pfn(phys));
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 13f0cb080a4d..673a2cdb2656 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -129,15 +129,17 @@ setup_io_tlb_npages(char *str)
}
early_param("swiotlb", setup_io_tlb_npages);
+static bool no_iotlb_memory;
+
unsigned long swiotlb_nr_tbl(void)
{
- return io_tlb_nslabs;
+ return unlikely(no_iotlb_memory) ? 0 : io_tlb_nslabs;
}
EXPORT_SYMBOL_GPL(swiotlb_nr_tbl);
unsigned int swiotlb_max_segment(void)
{
- return max_segment;
+ return unlikely(no_iotlb_memory) ? 0 : max_segment;
}
EXPORT_SYMBOL_GPL(swiotlb_max_segment);
@@ -160,8 +162,6 @@ unsigned long swiotlb_size_or_default(void)
return size ? size : (IO_TLB_DEFAULT_SIZE);
}
-static bool no_iotlb_memory;
-
void swiotlb_print_info(void)
{
unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
@@ -317,6 +317,14 @@ swiotlb_late_init_with_default_size(size_t default_size)
return rc;
}
+static void swiotlb_cleanup(void)
+{
+ io_tlb_end = 0;
+ io_tlb_start = 0;
+ io_tlb_nslabs = 0;
+ max_segment = 0;
+}
+
int
swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
{
@@ -367,10 +375,7 @@ cleanup4:
sizeof(int)));
io_tlb_list = NULL;
cleanup3:
- io_tlb_end = 0;
- io_tlb_start = 0;
- io_tlb_nslabs = 0;
- max_segment = 0;
+ swiotlb_cleanup();
return -ENOMEM;
}
@@ -394,10 +399,7 @@ void __init swiotlb_exit(void)
memblock_free_late(io_tlb_start,
PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
}
- io_tlb_start = 0;
- io_tlb_end = 0;
- io_tlb_nslabs = 0;
- max_segment = 0;
+ swiotlb_cleanup();
}
/*
@@ -442,7 +444,9 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
dma_addr_t tbl_dma_addr,
- phys_addr_t orig_addr, size_t size,
+ phys_addr_t orig_addr,
+ size_t mapping_size,
+ size_t alloc_size,
enum dma_data_direction dir,
unsigned long attrs)
{
@@ -459,8 +463,13 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
if (mem_encrypt_active())
- pr_warn_once("%s is active and system is using DMA bounce buffers\n",
- sme_active() ? "SME" : "SEV");
+ pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
+
+ if (mapping_size > alloc_size) {
+ dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
+ mapping_size, alloc_size);
+ return (phys_addr_t)DMA_MAPPING_ERROR;
+ }
mask = dma_get_seg_boundary(hwdev);
@@ -469,8 +478,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
/*
- * Carefully handle integer overflow which can occur when mask == ~0UL.
- */
+ * Carefully handle integer overflow which can occur when mask == ~0UL.
+ */
max_slots = mask + 1
? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT
: 1UL << (BITS_PER_LONG - IO_TLB_SHIFT);
@@ -479,8 +488,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
* For mappings greater than or equal to a page, we limit the stride
* (and hence alignment) to a page size.
*/
- nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- if (size >= PAGE_SIZE)
+ nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ if (alloc_size >= PAGE_SIZE)
stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT));
else
stride = 1;
@@ -545,8 +554,8 @@ not_found:
spin_unlock_irqrestore(&io_tlb_lock, flags);
if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
- size, io_tlb_nslabs, tmp_io_tlb_used);
- return DMA_MAPPING_ERROR;
+ alloc_size, io_tlb_nslabs, tmp_io_tlb_used);
+ return (phys_addr_t)DMA_MAPPING_ERROR;
found:
io_tlb_used += nslots;
spin_unlock_irqrestore(&io_tlb_lock, flags);
@@ -560,7 +569,7 @@ found:
io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE);
+ swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
return tlb_addr;
}
@@ -569,11 +578,11 @@ found:
* tlb_addr is the physical address of the bounce buffer to unmap.
*/
void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t size, enum dma_data_direction dir,
- unsigned long attrs)
+ size_t mapping_size, size_t alloc_size,
+ enum dma_data_direction dir, unsigned long attrs)
{
unsigned long flags;
- int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
+ int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = io_tlb_orig_addr[index];
@@ -583,7 +592,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
if (orig_addr != INVALID_PHYS_ADDR &&
!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
- swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE);
+ swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_FROM_DEVICE);
/*
* Return the buffer to the free list by setting the corresponding
@@ -663,14 +672,14 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
/* Oh well, have to allocate and map a bounce buffer. */
*phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
- *phys, size, dir, attrs);
- if (*phys == DMA_MAPPING_ERROR)
+ *phys, size, size, dir, attrs);
+ if (*phys == (phys_addr_t)DMA_MAPPING_ERROR)
return false;
/* Ensure that the address returned is DMA'ble */
*dma_addr = __phys_to_dma(dev, *phys);
if (unlikely(!dma_capable(dev, *dma_addr, size))) {
- swiotlb_tbl_unmap_single(dev, *phys, size, dir,
+ swiotlb_tbl_unmap_single(dev, *phys, size, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
return false;
}
@@ -696,29 +705,12 @@ bool is_swiotlb_active(void)
static int __init swiotlb_create_debugfs(void)
{
- struct dentry *d_swiotlb_usage;
- struct dentry *ent;
-
- d_swiotlb_usage = debugfs_create_dir("swiotlb", NULL);
-
- if (!d_swiotlb_usage)
- return -ENOMEM;
-
- ent = debugfs_create_ulong("io_tlb_nslabs", 0400,
- d_swiotlb_usage, &io_tlb_nslabs);
- if (!ent)
- goto fail;
-
- ent = debugfs_create_ulong("io_tlb_used", 0400,
- d_swiotlb_usage, &io_tlb_used);
- if (!ent)
- goto fail;
+ struct dentry *root;
+ root = debugfs_create_dir("swiotlb", NULL);
+ debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs);
+ debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used);
return 0;
-
-fail:
- debugfs_remove_recursive(d_swiotlb_usage);
- return -ENOMEM;
}
late_initcall(swiotlb_create_debugfs);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 785d708f8553..4f08b17d6426 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1103,7 +1103,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
raw_spin_lock_init(&cpuctx->hrtimer_lock);
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
timer->function = perf_mux_hrtimer_handler;
}
@@ -1121,7 +1121,7 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
if (!cpuctx->hrtimer_active) {
cpuctx->hrtimer_active = 1;
hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
@@ -1887,6 +1887,89 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->generation++;
}
+static int
+perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
+{
+ if (!has_aux(aux_event))
+ return 0;
+
+ if (!event->pmu->aux_output_match)
+ return 0;
+
+ return event->pmu->aux_output_match(aux_event);
+}
+
+static void put_event(struct perf_event *event);
+static void event_sched_out(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx);
+
+static void perf_put_aux_event(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event *iter;
+
+ /*
+ * If event uses aux_event tear down the link
+ */
+ if (event->aux_event) {
+ iter = event->aux_event;
+ event->aux_event = NULL;
+ put_event(iter);
+ return;
+ }
+
+ /*
+ * If the event is an aux_event, tear down all links to
+ * it from other events.
+ */
+ for_each_sibling_event(iter, event->group_leader) {
+ if (iter->aux_event != event)
+ continue;
+
+ iter->aux_event = NULL;
+ put_event(event);
+
+ /*
+ * If it's ACTIVE, schedule it out and put it into ERROR
+ * state so that we don't try to schedule it again. Note
+ * that perf_event_enable() will clear the ERROR status.
+ */
+ event_sched_out(iter, cpuctx, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ }
+}
+
+static int perf_get_aux_event(struct perf_event *event,
+ struct perf_event *group_leader)
+{
+ /*
+ * Our group leader must be an aux event if we want to be
+ * an aux_output. This way, the aux event will precede its
+ * aux_output events in the group, and therefore will always
+ * schedule first.
+ */
+ if (!group_leader)
+ return 0;
+
+ if (!perf_aux_output_match(event, group_leader))
+ return 0;
+
+ if (!atomic_long_inc_not_zero(&group_leader->refcount))
+ return 0;
+
+ /*
+ * Link aux_outputs to their aux event; this is undone in
+ * perf_group_detach() by perf_put_aux_event(). When the
+ * group in torn down, the aux_output events loose their
+ * link to the aux_event and can't schedule any more.
+ */
+ event->aux_event = group_leader;
+
+ return 1;
+}
+
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
@@ -1902,6 +1985,8 @@ static void perf_group_detach(struct perf_event *event)
event->attach_state &= ~PERF_ATTACH_GROUP;
+ perf_put_aux_event(event);
+
/*
* If this is a sibling, remove it from its group.
*/
@@ -2553,6 +2638,9 @@ unlock:
return ret;
}
+static bool exclusive_event_installable(struct perf_event *event,
+ struct perf_event_context *ctx);
+
/*
* Attach a performance event to a context.
*
@@ -2567,6 +2655,8 @@ perf_install_in_context(struct perf_event_context *ctx,
lockdep_assert_held(&ctx->mutex);
+ WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
+
if (event->cpu != -1)
event->cpu = cpu;
@@ -4084,10 +4174,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
return NULL;
__perf_event_init_context(ctx);
- if (task) {
- ctx->task = task;
- get_task_struct(task);
- }
+ if (task)
+ ctx->task = get_task_struct(task);
ctx->pmu = pmu;
return ctx;
@@ -4360,7 +4448,7 @@ static int exclusive_event_init(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ if (!is_exclusive_pmu(pmu))
return 0;
/*
@@ -4391,7 +4479,7 @@ static void exclusive_event_destroy(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ if (!is_exclusive_pmu(pmu))
return;
/* see comment in exclusive_event_init() */
@@ -4411,14 +4499,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
return false;
}
-/* Called under the same ctx::mutex as perf_install_in_context() */
static bool exclusive_event_installable(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *iter_event;
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ lockdep_assert_held(&ctx->mutex);
+
+ if (!is_exclusive_pmu(pmu))
return true;
list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
@@ -4465,12 +4554,20 @@ static void _free_event(struct perf_event *event)
if (event->destroy)
event->destroy(event);
- if (event->ctx)
- put_ctx(event->ctx);
-
+ /*
+ * Must be after ->destroy(), due to uprobe_perf_close() using
+ * hw.target.
+ */
if (event->hw.target)
put_task_struct(event->hw.target);
+ /*
+ * perf_event_free_task() relies on put_ctx() being 'last', in particular
+ * all task references must be cleaned up.
+ */
+ if (event->ctx)
+ put_ctx(event->ctx);
+
exclusive_event_destroy(event);
module_put(event->pmu->module);
@@ -4650,8 +4747,17 @@ again:
mutex_unlock(&event->child_mutex);
list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+ void *var = &child->ctx->refcount;
+
list_del(&child->child_list);
free_event(child);
+
+ /*
+ * Wake any perf_event_free_task() waiting for this event to be
+ * freed.
+ */
+ smp_mb(); /* pairs with wait_var_event() */
+ wake_up_var(var);
}
no_ctx:
@@ -9468,7 +9574,7 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
period = max_t(u64, 10000, hwc->sample_period);
}
hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@ -9490,7 +9596,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
if (!is_sampling_event(event))
return;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hwc->hrtimer.function = perf_swevent_hrtimer;
/*
@@ -10332,8 +10438,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* and we cannot use the ctx information because we need the
* pmu before we get a ctx.
*/
- get_task_struct(task);
- event->hw.target = task;
+ event->hw.target = get_task_struct(task);
}
event->clock = &local_clock;
@@ -10403,6 +10508,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
goto err_ns;
}
+ if (event->attr.aux_output &&
+ !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
+ err = -EOPNOTSUPP;
+ goto err_pmu;
+ }
+
err = exclusive_event_init(event);
if (err)
goto err_pmu;
@@ -10930,11 +11041,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_alloc;
}
- if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
- err = -EBUSY;
- goto err_context;
- }
-
/*
* Look up the group leader (we will attach this event to it):
*/
@@ -11022,6 +11128,18 @@ SYSCALL_DEFINE5(perf_event_open,
move_group = 0;
}
}
+
+ /*
+ * Failure to create exclusive events returns -EBUSY.
+ */
+ err = -EBUSY;
+ if (!exclusive_event_installable(group_leader, ctx))
+ goto err_locked;
+
+ for_each_sibling_event(sibling, group_leader) {
+ if (!exclusive_event_installable(sibling, ctx))
+ goto err_locked;
+ }
} else {
mutex_lock(&ctx->mutex);
}
@@ -11052,15 +11170,14 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
+ if (event->attr.aux_output && !perf_get_aux_event(event, group_leader))
+ goto err_locked;
/*
* Must be under the same ctx::mutex as perf_install_in_context(),
* because we need to serialize with concurrent event creation.
*/
if (!exclusive_event_installable(event, ctx)) {
- /* exclusive and group stuff are assumed mutually exclusive */
- WARN_ON_ONCE(move_group);
-
err = -EBUSY;
goto err_locked;
}
@@ -11247,7 +11364,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
- perf_install_in_context(ctx, event, cpu);
+ perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
@@ -11527,11 +11644,11 @@ static void perf_free_event(struct perf_event *event,
}
/*
- * Free an unexposed, unused context as created by inheritance by
- * perf_event_init_task below, used by fork() in case of fail.
+ * Free a context as created by inheritance by perf_event_init_task() below,
+ * used by fork() in case of fail.
*
- * Not all locks are strictly required, but take them anyway to be nice and
- * help out with the lockdep assertions.
+ * Even though the task has never lived, the context and events have been
+ * exposed through the child_list, so we must take care tearing it all down.
*/
void perf_event_free_task(struct task_struct *task)
{
@@ -11561,7 +11678,23 @@ void perf_event_free_task(struct task_struct *task)
perf_free_event(event, ctx);
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
+
+ /*
+ * perf_event_release_kernel() could've stolen some of our
+ * child events and still have them on its free_list. In that
+ * case we must wait for these events to have been freed (in
+ * particular all their references to this task must've been
+ * dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+ put_ctx(ctx); /* must be last */
}
}
@@ -11575,9 +11708,7 @@ void perf_event_delayed_put(struct task_struct *task)
struct file *perf_event_get(unsigned int fd)
{
- struct file *file;
-
- file = fget_raw(fd);
+ struct file *file = fget(fd);
if (!file)
return ERR_PTR(-EBADF);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index c5cd852fe86b..3cc8416ec844 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -413,7 +413,7 @@ static int hw_breakpoint_parse(struct perf_event *bp,
int register_perf_hw_breakpoint(struct perf_event *bp)
{
- struct arch_hw_breakpoint hw;
+ struct arch_hw_breakpoint hw = { };
int err;
err = reserve_bp_slot(bp);
@@ -461,7 +461,7 @@ int
modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr,
bool check)
{
- struct arch_hw_breakpoint hw;
+ struct arch_hw_breakpoint hw = { };
int err;
err = hw_breakpoint_parse(bp, attr, &hw);
diff --git a/kernel/exit.c b/kernel/exit.c
index a75b6a7f458a..22ab6a4bdc51 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -720,6 +720,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
+ tsk->exit_state = EXIT_ZOMBIE;
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
@@ -733,9 +734,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
autoreap = true;
}
- tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
- if (tsk->exit_state == EXIT_DEAD)
+ if (autoreap) {
+ tsk->exit_state = EXIT_DEAD;
list_add(&tsk->ptrace_entry, &dead);
+ }
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
@@ -1552,6 +1554,23 @@ end:
return retval;
}
+static struct pid *pidfd_get_pid(unsigned int fd)
+{
+ struct fd f;
+ struct pid *pid;
+
+ f = fdget(fd);
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ pid = pidfd_pid(f.file);
+ if (!IS_ERR(pid))
+ get_pid(pid);
+
+ fdput(f);
+ return pid;
+}
+
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
int options, struct rusage *ru)
{
@@ -1574,19 +1593,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
type = PIDTYPE_PID;
if (upid <= 0)
return -EINVAL;
+
+ pid = find_get_pid(upid);
break;
case P_PGID:
type = PIDTYPE_PGID;
- if (upid <= 0)
+ if (upid < 0)
return -EINVAL;
+
+ if (upid)
+ pid = find_get_pid(upid);
+ else
+ pid = get_task_pid(current, PIDTYPE_PGID);
+ break;
+ case P_PIDFD:
+ type = PIDTYPE_PID;
+ if (upid < 0)
+ return -EINVAL;
+
+ pid = pidfd_get_pid(upid);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
break;
default:
return -EINVAL;
}
- if (type < PIDTYPE_MAX)
- pid = find_get_pid(upid);
-
wo.wo_type = type;
wo.wo_pid = pid;
wo.wo_flags = options;
diff --git a/kernel/extable.c b/kernel/extable.c
index e23cce6e6092..f6c9406eec7d 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,13 +40,20 @@ void __init sort_main_extable(void)
}
}
+/* Given an address, look for it in the kernel exception table */
+const
+struct exception_table_entry *search_kernel_exception_table(unsigned long addr)
+{
+ return search_extable(__start___ex_table,
+ __stop___ex_table - __start___ex_table, addr);
+}
+
/* Given an address, look for it in the exception tables. */
const struct exception_table_entry *search_exception_tables(unsigned long addr)
{
const struct exception_table_entry *e;
- e = search_extable(__start___ex_table,
- __stop___ex_table - __start___ex_table, addr);
+ e = search_kernel_exception_table(addr);
if (!e)
e = search_module_extables(addr);
return e;
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index feb80712b913..63b349168da7 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -152,20 +152,13 @@ static int fei_retval_get(void *data, u64 *val)
DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set,
"%llx\n");
-static int fei_debugfs_add_attr(struct fei_attr *attr)
+static void fei_debugfs_add_attr(struct fei_attr *attr)
{
struct dentry *dir;
dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir);
- if (!dir)
- return -ENOMEM;
-
- if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) {
- debugfs_remove_recursive(dir);
- return -ENOMEM;
- }
- return 0;
+ debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops);
}
static void fei_debugfs_remove_attr(struct fei_attr *attr)
@@ -306,7 +299,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
ret = register_kprobe(&attr->kp);
if (!ret)
- ret = fei_debugfs_add_attr(attr);
+ fei_debugfs_add_attr(attr);
if (ret < 0)
fei_attr_remove(attr);
else {
@@ -337,19 +330,13 @@ static int __init fei_debugfs_init(void)
return PTR_ERR(dir);
/* injectable attribute is just a symlink of error_inject/list */
- if (!debugfs_create_symlink("injectable", dir,
- "../error_injection/list"))
- goto error;
+ debugfs_create_symlink("injectable", dir, "../error_injection/list");
- if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops))
- goto error;
+ debugfs_create_file("inject", 0600, dir, NULL, &fei_ops);
fei_debugfs_dir = dir;
return 0;
-error:
- debugfs_remove_recursive(dir);
- return -ENOMEM;
}
late_initcall(fei_debugfs_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index 8f3e2d97d771..5a0fd518e04e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -677,7 +677,6 @@ void __mmdrop(struct mm_struct *mm)
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
- hmm_mm_destroy(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
@@ -727,7 +726,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(tsk == current);
cgroup_free(tsk);
- task_numa_free(tsk);
+ task_numa_free(tsk, true);
security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
@@ -769,6 +768,7 @@ static void set_max_threads(unsigned int max_threads_suggested)
int arch_task_struct_size __read_mostly;
#endif
+#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
@@ -783,6 +783,7 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
else
*offset += offsetof(struct task_struct, thread);
}
+#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
void __init fork_init(void)
{
@@ -1008,7 +1009,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_owner(mm, p);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
- hmm_mm_init(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
@@ -1518,28 +1518,17 @@ void __cleanup_sighand(struct sighand_struct *sighand)
}
}
-#ifdef CONFIG_POSIX_TIMERS
/*
* Initialize POSIX timer handling for a thread group.
*/
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
+ struct posix_cputimers *pct = &sig->posix_cputimers;
unsigned long cpu_limit;
cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
- if (cpu_limit != RLIM_INFINITY) {
- sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC;
- sig->cputimer.running = true;
- }
-
- /* The timer lists. */
- INIT_LIST_HEAD(&sig->cpu_timers[0]);
- INIT_LIST_HEAD(&sig->cpu_timers[1]);
- INIT_LIST_HEAD(&sig->cpu_timers[2]);
+ posix_cputimers_group_init(pct, cpu_limit);
}
-#else
-static inline void posix_cpu_timers_init_group(struct signal_struct *sig) { }
-#endif
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
@@ -1641,23 +1630,6 @@ static void rt_mutex_init_task(struct task_struct *p)
#endif
}
-#ifdef CONFIG_POSIX_TIMERS
-/*
- * Initialize POSIX timer handling for a single task.
- */
-static void posix_cpu_timers_init(struct task_struct *tsk)
-{
- tsk->cputime_expires.prof_exp = 0;
- tsk->cputime_expires.virt_exp = 0;
- tsk->cputime_expires.sched_exp = 0;
- INIT_LIST_HEAD(&tsk->cpu_timers[0]);
- INIT_LIST_HEAD(&tsk->cpu_timers[1]);
- INIT_LIST_HEAD(&tsk->cpu_timers[2]);
-}
-#else
-static inline void posix_cpu_timers_init(struct task_struct *tsk) { }
-#endif
-
static inline void init_task_pid_links(struct task_struct *task)
{
enum pid_type type;
@@ -1691,6 +1663,14 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif /* #ifdef CONFIG_TASKS_RCU */
}
+struct pid *pidfd_pid(const struct file *file)
+{
+ if (file->f_op == &pidfd_fops)
+ return file->private_data;
+
+ return ERR_PTR(-EBADF);
+}
+
static int pidfd_release(struct inode *inode, struct file *file)
{
struct pid *pid = file->private_data;
@@ -1936,7 +1916,7 @@ static __latent_entropy struct task_struct *copy_process(
task_io_accounting_init(&p->ioac);
acct_clear_integrals(p);
- posix_cpu_timers_init(p);
+ posix_cputimers_init(&p->posix_cputimers);
p->io_context = NULL;
audit_set_context(p, NULL);
@@ -2339,6 +2319,8 @@ struct mm_struct *copy_init_mm(void)
*
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
+ *
+ * args->exit_signal is expected to be checked for sanity by the caller.
*/
long _do_fork(struct kernel_clone_args *args)
{
@@ -2406,6 +2388,16 @@ long _do_fork(struct kernel_clone_args *args)
return nr;
}
+bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
+{
+ /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+ if ((kargs->flags & CLONE_PIDFD) &&
+ (kargs->flags & CLONE_PARENT_SETTID))
+ return false;
+
+ return true;
+}
+
#ifndef CONFIG_HAVE_COPY_THREAD_TLS
/* For compatibility with architectures that call do_fork directly rather than
* using the syscall entry points below. */
@@ -2417,6 +2409,7 @@ long do_fork(unsigned long clone_flags,
{
struct kernel_clone_args args = {
.flags = (clone_flags & ~CSIGNAL),
+ .pidfd = parent_tidptr,
.child_tid = child_tidptr,
.parent_tid = parent_tidptr,
.exit_signal = (clone_flags & CSIGNAL),
@@ -2424,6 +2417,9 @@ long do_fork(unsigned long clone_flags,
.stack_size = stack_size,
};
+ if (!legacy_clone_args_valid(&args))
+ return -EINVAL;
+
return _do_fork(&args);
}
#endif
@@ -2505,8 +2501,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
.tls = tls,
};
- /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
- if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+ if (!legacy_clone_args_valid(&args))
return -EINVAL;
return _do_fork(&args);
@@ -2550,6 +2545,14 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
if (copy_from_user(&args, uargs, size))
return -EFAULT;
+ /*
+ * Verify that higher 32bits of exit_signal are unset and that
+ * it is a valid signal
+ */
+ if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
+ !valid_signal(args.exit_signal)))
+ return -EINVAL;
+
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
diff --git a/kernel/futex.c b/kernel/futex.c
index 6d50728ef2e7..bd18f60e4c6c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -487,11 +487,9 @@ futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
if (!time)
return NULL;
- hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(timeout, current);
-
+ hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
/*
* If range_ns is 0, calling hrtimer_set_expires_range_ns() is
* effectively the same as calling hrtimer_set_expires().
@@ -2613,7 +2611,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
/* Arm the timer */
if (timeout)
- hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+ hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);
/*
* If we have been removed from the hash list, then another task
@@ -2899,7 +2897,7 @@ retry_private:
}
if (unlikely(to))
- hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+ hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 3941a9c48f83..060e8e726755 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
- select CONSTRUCTORS if !UML
+ select CONSTRUCTORS
default n
---help---
This option enables gcov-based code profiling (e.g. for code coverage
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 6e40ff6be083..e5eb5ea7ea59 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -64,7 +64,6 @@ struct gcov_node {
static const char objtree[] = OBJTREE;
static const char srctree[] = SRCTREE;
static struct gcov_node root_node;
-static struct dentry *reset_dentry;
static LIST_HEAD(all_head);
static DEFINE_MUTEX(node_lock);
@@ -387,8 +386,6 @@ static void add_links(struct gcov_node *node, struct dentry *parent)
goto out_err;
node->links[i] = debugfs_create_symlink(deskew(basename),
parent, target);
- if (!node->links[i])
- goto out_err;
kfree(target);
}
@@ -450,11 +447,6 @@ static struct gcov_node *new_node(struct gcov_node *parent,
parent->dentry, node, &gcov_data_fops);
} else
node->dentry = debugfs_create_dir(node->name, parent->dentry);
- if (!node->dentry) {
- pr_warn("could not create file\n");
- kfree(node);
- return NULL;
- }
if (info)
add_links(node, parent->dentry);
list_add(&node->list, &parent->children);
@@ -761,32 +753,20 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
/* Create debugfs entries. */
static __init int gcov_fs_init(void)
{
- int rc = -EIO;
-
init_node(&root_node, NULL, NULL, NULL);
/*
* /sys/kernel/debug/gcov will be parent for the reset control file
* and all profiling files.
*/
root_node.dentry = debugfs_create_dir("gcov", NULL);
- if (!root_node.dentry)
- goto err_remove;
/*
* Create reset file which resets all profiling counts when written
* to.
*/
- reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
- NULL, &gcov_reset_fops);
- if (!reset_dentry)
- goto err_remove;
+ debugfs_create_file("reset", 0600, root_node.dentry, NULL,
+ &gcov_reset_fops);
/* Replay previous events to get our fs hierarchy up-to-date. */
gcov_enable_events();
return 0;
-
-err_remove:
- pr_err("init failed\n");
- debugfs_remove(root_node.dentry);
-
- return rc;
}
device_initcall(gcov_fs_init);
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 9a34e1d9bd7f..9ff449888d9c 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -4,24 +4,12 @@
# This script generates an archive consisting of kernel headers
# for CONFIG_IKHEADERS.
set -e
-spath="$(dirname "$(readlink -f "$0")")"
-kroot="$spath/.."
+sfile="$(readlink -f "$0")"
outdir="$(pwd)"
tarfile=$1
cpio_dir=$outdir/$tarfile.tmp
-# Script filename relative to the kernel source root
-# We add it to the archive because it is small and any changes
-# to this script will also cause a rebuild of the archive.
-sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")"
-
-src_file_list="
-include/
-arch/$SRCARCH/include/
-$sfile
-"
-
-obj_file_list="
+dir_list="
include/
arch/$SRCARCH/include/
"
@@ -33,33 +21,29 @@ arch/$SRCARCH/include/
# Uncomment it for debugging.
# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter;
# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi
-# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter
-# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter
+# find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter
+# find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter
# include/generated/compile.h is ignored because it is touched even when none
# of the source files changed. This causes pointless regeneration, so let us
# ignore them for md5 calculation.
-pushd $kroot > /dev/null
-src_files_md5="$(find $src_file_list -type f |
+pushd $srctree > /dev/null
+src_files_md5="$(find $dir_list -name "*.h" |
grep -v "include/generated/compile.h" |
grep -v "include/generated/autoconf.h" |
- grep -v "include/config/auto.conf" |
- grep -v "include/config/auto.conf.cmd" |
- grep -v "include/config/tristate.conf" |
- xargs ls -lR | md5sum | cut -d ' ' -f1)"
+ xargs ls -l | md5sum | cut -d ' ' -f1)"
popd > /dev/null
-obj_files_md5="$(find $obj_file_list -type f |
+obj_files_md5="$(find $dir_list -name "*.h" |
grep -v "include/generated/compile.h" |
grep -v "include/generated/autoconf.h" |
- grep -v "include/config/auto.conf" |
- grep -v "include/config/auto.conf.cmd" |
- grep -v "include/config/tristate.conf" |
- xargs ls -lR | md5sum | cut -d ' ' -f1)"
-
+ xargs ls -l | md5sum | cut -d ' ' -f1)"
+# Any changes to this script will also cause a rebuild of the archive.
+this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)"
if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
if [ -f kernel/kheaders.md5 ] &&
[ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] &&
[ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] &&
+ [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] &&
[ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then
exit
fi
@@ -71,16 +55,16 @@ fi
rm -rf $cpio_dir
mkdir $cpio_dir
-pushd $kroot > /dev/null
-for f in $src_file_list;
- do find "$f" ! -name "*.cmd" ! -name ".*";
+pushd $srctree > /dev/null
+for f in $dir_list;
+ do find "$f" -name "*.h";
done | cpio --quiet -pd $cpio_dir
popd > /dev/null
# The second CPIO can complain if files already exist which can
# happen with out of tree builds. Just silence CPIO for now.
-for f in $obj_file_list;
- do find "$f" ! -name "*.cmd" ! -name ".*";
+for f in $dir_list;
+ do find "$f" -name "*.h";
done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
# Remove comments except SDPX lines
@@ -91,6 +75,7 @@ tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null
echo "$src_files_md5" > kernel/kheaders.md5
echo "$obj_files_md5" >> kernel/kheaders.md5
+echo "$this_file_md5" >> kernel/kheaders.md5
echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
rm -rf $cpio_dir
diff --git a/kernel/iomem.c b/kernel/iomem.c
index 93c264444510..62c92e43aa0d 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap);
void memunmap(void *addr)
{
- if (is_vmalloc_addr(addr))
+ if (is_ioremap_addr(addr))
iounmap((void __iomem *) addr);
}
EXPORT_SYMBOL(memunmap);
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4352b08ae48d..4d89ad4fae3b 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -7,6 +7,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
+#include <linux/sort.h>
static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
unsigned int cpus_per_vec)
@@ -94,6 +95,155 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
return nodes;
}
+struct node_vectors {
+ unsigned id;
+
+ union {
+ unsigned nvectors;
+ unsigned ncpus;
+ };
+};
+
+static int ncpus_cmp_func(const void *l, const void *r)
+{
+ const struct node_vectors *ln = l;
+ const struct node_vectors *rn = r;
+
+ return ln->ncpus - rn->ncpus;
+}
+
+/*
+ * Allocate vector number for each node, so that for each node:
+ *
+ * 1) the allocated number is >= 1
+ *
+ * 2) the allocated numbver is <= active CPU number of this node
+ *
+ * The actual allocated total vectors may be less than @numvecs when
+ * active total CPU number is less than @numvecs.
+ *
+ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
+ * for each node.
+ */
+static void alloc_nodes_vectors(unsigned int numvecs,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ const nodemask_t nodemsk,
+ struct cpumask *nmsk,
+ struct node_vectors *node_vectors)
+{
+ unsigned n, remaining_ncpus = 0;
+
+ for (n = 0; n < nr_node_ids; n++) {
+ node_vectors[n].id = n;
+ node_vectors[n].ncpus = UINT_MAX;
+ }
+
+ for_each_node_mask(n, nodemsk) {
+ unsigned ncpus;
+
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+ ncpus = cpumask_weight(nmsk);
+
+ if (!ncpus)
+ continue;
+ remaining_ncpus += ncpus;
+ node_vectors[n].ncpus = ncpus;
+ }
+
+ numvecs = min_t(unsigned, remaining_ncpus, numvecs);
+
+ sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
+ ncpus_cmp_func, NULL);
+
+ /*
+ * Allocate vectors for each node according to the ratio of this
+ * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
+ * bigger than number of active numa nodes. Always start the
+ * allocation from the node with minimized nr_cpus.
+ *
+ * This way guarantees that each active node gets allocated at
+ * least one vector, and the theory is simple: over-allocation
+ * is only done when this node is assigned by one vector, so
+ * other nodes will be allocated >= 1 vector, since 'numvecs' is
+ * bigger than number of numa nodes.
+ *
+ * One perfect invariant is that number of allocated vectors for
+ * each node is <= CPU count of this node:
+ *
+ * 1) suppose there are two nodes: A and B
+ * ncpu(X) is CPU count of node X
+ * vecs(X) is the vector count allocated to node X via this
+ * algorithm
+ *
+ * ncpu(A) <= ncpu(B)
+ * ncpu(A) + ncpu(B) = N
+ * vecs(A) + vecs(B) = V
+ *
+ * vecs(A) = max(1, round_down(V * ncpu(A) / N))
+ * vecs(B) = V - vecs(A)
+ *
+ * both N and V are integer, and 2 <= V <= N, suppose
+ * V = N - delta, and 0 <= delta <= N - 2
+ *
+ * 2) obviously vecs(A) <= ncpu(A) because:
+ *
+ * if vecs(A) is 1, then vecs(A) <= ncpu(A) given
+ * ncpu(A) >= 1
+ *
+ * otherwise,
+ * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
+ *
+ * 3) prove how vecs(B) <= ncpu(B):
+ *
+ * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
+ * over-allocated, so vecs(B) <= ncpu(B),
+ *
+ * otherwise:
+ *
+ * vecs(A) =
+ * round_down(V * ncpu(A) / N) =
+ * round_down((N - delta) * ncpu(A) / N) =
+ * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
+ * round_down((N * ncpu(A) - delta * N) / N) =
+ * cpu(A) - delta
+ *
+ * then:
+ *
+ * vecs(A) - V >= ncpu(A) - delta - V
+ * =>
+ * V - vecs(A) <= V + delta - ncpu(A)
+ * =>
+ * vecs(B) <= N - ncpu(A)
+ * =>
+ * vecs(B) <= cpu(B)
+ *
+ * For nodes >= 3, it can be thought as one node and another big
+ * node given that is exactly what this algorithm is implemented,
+ * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
+ * finally for each node X: vecs(X) <= ncpu(X).
+ *
+ */
+ for (n = 0; n < nr_node_ids; n++) {
+ unsigned nvectors, ncpus;
+
+ if (node_vectors[n].ncpus == UINT_MAX)
+ continue;
+
+ WARN_ON_ONCE(numvecs == 0);
+
+ ncpus = node_vectors[n].ncpus;
+ nvectors = max_t(unsigned, 1,
+ numvecs * ncpus / remaining_ncpus);
+ WARN_ON_ONCE(nvectors > ncpus);
+
+ node_vectors[n].nvectors = nvectors;
+
+ remaining_ncpus -= ncpus;
+ numvecs -= nvectors;
+ }
+}
+
static int __irq_build_affinity_masks(unsigned int startvec,
unsigned int numvecs,
unsigned int firstvec,
@@ -102,10 +252,11 @@ static int __irq_build_affinity_masks(unsigned int startvec,
struct cpumask *nmsk,
struct irq_affinity_desc *masks)
{
- unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+ unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
unsigned int last_affv = firstvec + numvecs;
unsigned int curvec = startvec;
nodemask_t nodemsk = NODE_MASK_NONE;
+ struct node_vectors *node_vectors;
if (!cpumask_weight(cpu_mask))
return 0;
@@ -126,42 +277,56 @@ static int __irq_build_affinity_masks(unsigned int startvec,
return numvecs;
}
- for_each_node_mask(n, nodemsk) {
- unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
+ node_vectors = kcalloc(nr_node_ids,
+ sizeof(struct node_vectors),
+ GFP_KERNEL);
+ if (!node_vectors)
+ return -ENOMEM;
- /* Spread the vectors per node */
- vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
+ /* allocate vector number for each node */
+ alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
+ nodemsk, nmsk, node_vectors);
- /* Get the cpus on this node which are in the mask */
- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+ for (i = 0; i < nr_node_ids; i++) {
+ unsigned int ncpus, v;
+ struct node_vectors *nv = &node_vectors[i];
+
+ if (nv->nvectors == UINT_MAX)
+ continue;
- /* Calculate the number of cpus per vector */
+ /* Get the cpus on this node which are in the mask */
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
ncpus = cpumask_weight(nmsk);
- vecs_to_assign = min(vecs_per_node, ncpus);
+ if (!ncpus)
+ continue;
+
+ WARN_ON_ONCE(nv->nvectors > ncpus);
/* Account for rounding errors */
- extra_vecs = ncpus - vecs_to_assign * (ncpus / vecs_to_assign);
+ extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
- for (v = 0; curvec < last_affv && v < vecs_to_assign;
- curvec++, v++) {
- cpus_per_vec = ncpus / vecs_to_assign;
+ /* Spread allocated vectors on CPUs of the current node */
+ for (v = 0; v < nv->nvectors; v++, curvec++) {
+ cpus_per_vec = ncpus / nv->nvectors;
/* Account for extra vectors to compensate rounding errors */
if (extra_vecs) {
cpus_per_vec++;
--extra_vecs;
}
+
+ /*
+ * wrapping has to be considered given 'startvec'
+ * may start anywhere
+ */
+ if (curvec >= last_affv)
+ curvec = firstvec;
irq_spread_init_one(&masks[curvec].mask, nmsk,
cpus_per_vec);
}
-
- done += v;
- if (done >= numvecs)
- break;
- if (curvec >= last_affv)
- curvec = firstvec;
- --nodes;
+ done += nv->nvectors;
}
+ kfree(node_vectors);
return done;
}
@@ -174,7 +339,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
unsigned int firstvec,
struct irq_affinity_desc *masks)
{
- unsigned int curvec = startvec, nr_present, nr_others;
+ unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
@@ -189,15 +354,17 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
if (!node_to_cpumask)
goto fail_npresmsk;
- ret = 0;
/* Stabilize the cpumasks */
get_online_cpus();
build_node_to_cpumask(node_to_cpumask);
/* Spread on present CPUs starting from affd->pre_vectors */
- nr_present = __irq_build_affinity_masks(curvec, numvecs,
- firstvec, node_to_cpumask,
- cpu_present_mask, nmsk, masks);
+ ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+ node_to_cpumask, cpu_present_mask,
+ nmsk, masks);
+ if (ret < 0)
+ goto fail_build_affinity;
+ nr_present = ret;
/*
* Spread on non present CPUs starting from the next vector to be
@@ -210,12 +377,16 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
else
curvec = firstvec + nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- nr_others = __irq_build_affinity_masks(curvec, numvecs,
- firstvec, node_to_cpumask,
- npresmsk, nmsk, masks);
+ ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+ node_to_cpumask, npresmsk, nmsk,
+ masks);
+ if (ret >= 0)
+ nr_others = ret;
+
+ fail_build_affinity:
put_online_cpus();
- if (nr_present < numvecs)
+ if (ret >= 0)
WARN_ON(nr_present + nr_others < numvecs);
free_node_to_cpumask(node_to_cpumask);
@@ -225,7 +396,7 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
fail_nmsk:
free_cpumask_var(nmsk);
- return ret;
+ return ret < 0 ? ret : 0;
}
static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
@@ -251,11 +422,9 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
* Determine the number of vectors which need interrupt affinities
* assigned. If the pre/post request exhausts the available vectors
* then nothing to do here except for invoking the calc_sets()
- * callback so the device driver can adjust to the situation. If there
- * is only a single vector, then managing the queue is pointless as
- * well.
+ * callback so the device driver can adjust to the situation.
*/
- if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors)
+ if (nvecs > affd->pre_vectors + affd->post_vectors)
affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
else
affvecs = 0;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9484e88dabc2..9be995fc3c5a 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -295,6 +295,18 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc)
}
}
+static void irq_sysfs_del(struct irq_desc *desc)
+{
+ /*
+ * If irq_sysfs_init() has not yet been invoked (early boot), then
+ * irq_kobj_base is NULL and the descriptor was never added.
+ * kobject_del() complains about a object with no parent, so make
+ * it conditional.
+ */
+ if (irq_kobj_base)
+ kobject_del(&desc->kobj);
+}
+
static int __init irq_sysfs_init(void)
{
struct irq_desc *desc;
@@ -325,6 +337,7 @@ static struct kobj_type irq_kobj_type = {
};
static void irq_sysfs_add(int irq, struct irq_desc *desc) {}
+static void irq_sysfs_del(struct irq_desc *desc) {}
#endif /* CONFIG_SYSFS */
@@ -438,7 +451,7 @@ static void free_desc(unsigned int irq)
* The sysfs entry must be serialized against a concurrent
* irq_sysfs_init() as well.
*/
- kobject_del(&desc->kobj);
+ irq_sysfs_del(desc);
delete_irq_desc(irq);
/*
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3078d0e48bba..132672b74e4b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -31,7 +31,7 @@ struct irqchip_fwid {
struct fwnode_handle fwnode;
unsigned int type;
char *name;
- void *data;
+ phys_addr_t *pa;
};
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
@@ -62,7 +62,8 @@ EXPORT_SYMBOL_GPL(irqchip_fwnode_ops);
* domain struct.
*/
struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
- const char *name, void *data)
+ const char *name,
+ phys_addr_t *pa)
{
struct irqchip_fwid *fwid;
char *n;
@@ -77,7 +78,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
n = kasprintf(GFP_KERNEL, "%s-%d", name, id);
break;
default:
- n = kasprintf(GFP_KERNEL, "irqchip@%p", data);
+ n = kasprintf(GFP_KERNEL, "irqchip@%pa", pa);
break;
}
@@ -89,7 +90,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id,
fwid->type = type;
fwid->name = n;
- fwid->data = data;
+ fwid->pa = pa;
fwid->fwnode.ops = &irqchip_fwnode_ops;
return &fwid->fwnode;
}
@@ -148,6 +149,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
switch (fwid->type) {
case IRQCHIP_FWNODE_NAMED:
case IRQCHIP_FWNODE_NAMED_ID:
+ domain->fwnode = fwnode;
domain->name = kstrdup(fwid->name, GFP_KERNEL);
if (!domain->name) {
kfree(domain);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e8f7f179bf77..1753486b440c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -23,7 +23,7 @@
#include "internals.h"
-#ifdef CONFIG_IRQ_FORCED_THREADING
+#if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT)
__read_mostly bool force_irqthreads;
EXPORT_SYMBOL_GPL(force_irqthreads);
@@ -1255,8 +1255,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
* the thread dies to avoid that the interrupt code
* references an already freed task_struct.
*/
- get_task_struct(t);
- new->thread = t;
+ new->thread = get_task_struct(t);
/*
* Tell the thread to set its affinity. This is
* important for shared interrupt handlers as we do
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index d6961d3c6f9e..8f557fa1f4fe 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -177,6 +177,26 @@ static void resume_irqs(bool want_early)
}
/**
+ * rearm_wake_irq - rearm a wakeup interrupt line after signaling wakeup
+ * @irq: Interrupt to rearm
+ */
+void rearm_wake_irq(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
+
+ if (!desc || !(desc->istate & IRQS_SUSPENDED) ||
+ !irqd_is_wakeup_set(&desc->irq_data))
+ return;
+
+ desc->istate &= ~IRQS_SUSPENDED;
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ __enable_irq(desc);
+
+ irq_put_desc_busunlock(desc, flags);
+}
+
+/**
* irq_pm_syscore_ops - enable interrupt lines early
*
* Enable all interrupt lines with %IRQF_EARLY_RESUME set.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index da9addb8d655..cfc4f088a0e7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -100,10 +100,6 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
return 0;
}
-#ifndef is_affinity_mask_valid
-#define is_affinity_mask_valid(val) 1
-#endif
-
int no_irq_affinity;
static int irq_affinity_proc_show(struct seq_file *m, void *v)
{
@@ -136,11 +132,6 @@ static ssize_t write_irq_affinity(int type, struct file *file,
if (err)
goto free_cpumask;
- if (!is_affinity_mask_valid(new_value)) {
- err = -EINVAL;
- goto free_cpumask;
- }
-
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
@@ -232,11 +223,6 @@ static ssize_t default_affinity_write(struct file *file,
if (err)
goto out;
- if (!is_affinity_mask_valid(new_value)) {
- err = -EINVAL;
- goto out;
- }
-
/*
* Do not allow disabling IRQs completely - it's a too easy
* way to make the system unusable accidentally :-) At least
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 95414ad3506a..98c04ca5fa43 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -36,6 +36,8 @@ static void resend_irqs(unsigned long arg)
irq = find_first_bit(irqs_resend, nr_irqs);
clear_bit(irq, irqs_resend);
desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
local_irq_disable();
desc->handle_irq(desc);
local_irq_enable();
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index df3008419a1d..cdb3ffab128b 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -407,7 +407,9 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)
return false;
if (!kernel_text_address(jump_entry_code(entry))) {
- WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry));
+ WARN_ONCE(!jump_entry_is_init(entry),
+ "can't patch jump_label at %pS",
+ (void *)jump_entry_code(entry));
return false;
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 95a260f9214b..136ce049c4ad 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -263,8 +263,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
{
char namebuf[KSYM_NAME_LEN];
- if (is_ksym_addr(addr))
- return !!get_symbol_pos(addr, symbolsize, offset);
+ if (is_ksym_addr(addr)) {
+ get_symbol_pos(addr, symbolsize, offset);
+ return 1;
+ }
return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf) ||
!!__bpf_address_lookup(addr, symbolsize, offset, namebuf);
}
diff --git a/kernel/kexec_elf.c b/kernel/kexec_elf.c
new file mode 100644
index 000000000000..d3689632e8b9
--- /dev/null
+++ b/kernel/kexec_elf.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Load ELF vmlinux file for the kexec_file_load syscall.
+ *
+ * Copyright (C) 2004 Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004 IBM Corp.
+ * Copyright (C) 2005 R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006 Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2016 IBM Corporation
+ *
+ * Based on kexec-tools' kexec-elf-exec.c and kexec-elf-ppc64.c.
+ * Heavily modified for the kernel by
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>.
+ */
+
+#define pr_fmt(fmt) "kexec_elf: " fmt
+
+#include <linux/elf.h>
+#include <linux/kexec.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+static inline bool elf_is_elf_file(const struct elfhdr *ehdr)
+{
+ return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0;
+}
+
+static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value)
+{
+ if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+ value = le64_to_cpu(value);
+ else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+ value = be64_to_cpu(value);
+
+ return value;
+}
+
+static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value)
+{
+ if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+ value = le32_to_cpu(value);
+ else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+ value = be32_to_cpu(value);
+
+ return value;
+}
+
+static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value)
+{
+ if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+ value = le16_to_cpu(value);
+ else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+ value = be16_to_cpu(value);
+
+ return value;
+}
+
+/**
+ * elf_is_ehdr_sane - check that it is safe to use the ELF header
+ * @buf_len: size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_ehdr_sane(const struct elfhdr *ehdr, size_t buf_len)
+{
+ if (ehdr->e_phnum > 0 && ehdr->e_phentsize != sizeof(struct elf_phdr)) {
+ pr_debug("Bad program header size.\n");
+ return false;
+ } else if (ehdr->e_shnum > 0 &&
+ ehdr->e_shentsize != sizeof(struct elf_shdr)) {
+ pr_debug("Bad section header size.\n");
+ return false;
+ } else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT ||
+ ehdr->e_version != EV_CURRENT) {
+ pr_debug("Unknown ELF version.\n");
+ return false;
+ }
+
+ if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
+ size_t phdr_size;
+
+ /*
+ * e_phnum is at most 65535 so calculating the size of the
+ * program header cannot overflow.
+ */
+ phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
+
+ /* Sanity check the program header table location. */
+ if (ehdr->e_phoff + phdr_size < ehdr->e_phoff) {
+ pr_debug("Program headers at invalid location.\n");
+ return false;
+ } else if (ehdr->e_phoff + phdr_size > buf_len) {
+ pr_debug("Program headers truncated.\n");
+ return false;
+ }
+ }
+
+ if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) {
+ size_t shdr_size;
+
+ /*
+ * e_shnum is at most 65536 so calculating
+ * the size of the section header cannot overflow.
+ */
+ shdr_size = sizeof(struct elf_shdr) * ehdr->e_shnum;
+
+ /* Sanity check the section header table location. */
+ if (ehdr->e_shoff + shdr_size < ehdr->e_shoff) {
+ pr_debug("Section headers at invalid location.\n");
+ return false;
+ } else if (ehdr->e_shoff + shdr_size > buf_len) {
+ pr_debug("Section headers truncated.\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr)
+{
+ struct elfhdr *buf_ehdr;
+
+ if (len < sizeof(*buf_ehdr)) {
+ pr_debug("Buffer is too small to hold ELF header.\n");
+ return -ENOEXEC;
+ }
+
+ memset(ehdr, 0, sizeof(*ehdr));
+ memcpy(ehdr->e_ident, buf, sizeof(ehdr->e_ident));
+ if (!elf_is_elf_file(ehdr)) {
+ pr_debug("No ELF header magic.\n");
+ return -ENOEXEC;
+ }
+
+ if (ehdr->e_ident[EI_CLASS] != ELF_CLASS) {
+ pr_debug("Not a supported ELF class.\n");
+ return -ENOEXEC;
+ } else if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB &&
+ ehdr->e_ident[EI_DATA] != ELFDATA2MSB) {
+ pr_debug("Not a supported ELF data format.\n");
+ return -ENOEXEC;
+ }
+
+ buf_ehdr = (struct elfhdr *) buf;
+ if (elf16_to_cpu(ehdr, buf_ehdr->e_ehsize) != sizeof(*buf_ehdr)) {
+ pr_debug("Bad ELF header size.\n");
+ return -ENOEXEC;
+ }
+
+ ehdr->e_type = elf16_to_cpu(ehdr, buf_ehdr->e_type);
+ ehdr->e_machine = elf16_to_cpu(ehdr, buf_ehdr->e_machine);
+ ehdr->e_version = elf32_to_cpu(ehdr, buf_ehdr->e_version);
+ ehdr->e_flags = elf32_to_cpu(ehdr, buf_ehdr->e_flags);
+ ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize);
+ ehdr->e_phnum = elf16_to_cpu(ehdr, buf_ehdr->e_phnum);
+ ehdr->e_shentsize = elf16_to_cpu(ehdr, buf_ehdr->e_shentsize);
+ ehdr->e_shnum = elf16_to_cpu(ehdr, buf_ehdr->e_shnum);
+ ehdr->e_shstrndx = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx);
+
+ switch (ehdr->e_ident[EI_CLASS]) {
+ case ELFCLASS64:
+ ehdr->e_entry = elf64_to_cpu(ehdr, buf_ehdr->e_entry);
+ ehdr->e_phoff = elf64_to_cpu(ehdr, buf_ehdr->e_phoff);
+ ehdr->e_shoff = elf64_to_cpu(ehdr, buf_ehdr->e_shoff);
+ break;
+
+ case ELFCLASS32:
+ ehdr->e_entry = elf32_to_cpu(ehdr, buf_ehdr->e_entry);
+ ehdr->e_phoff = elf32_to_cpu(ehdr, buf_ehdr->e_phoff);
+ ehdr->e_shoff = elf32_to_cpu(ehdr, buf_ehdr->e_shoff);
+ break;
+
+ default:
+ pr_debug("Unknown ELF class.\n");
+ return -EINVAL;
+ }
+
+ return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_is_phdr_sane - check that it is safe to use the program header
+ * @buf_len: size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_phdr_sane(const struct elf_phdr *phdr, size_t buf_len)
+{
+
+ if (phdr->p_offset + phdr->p_filesz < phdr->p_offset) {
+ pr_debug("ELF segment location wraps around.\n");
+ return false;
+ } else if (phdr->p_offset + phdr->p_filesz > buf_len) {
+ pr_debug("ELF segment not in file.\n");
+ return false;
+ } else if (phdr->p_paddr + phdr->p_memsz < phdr->p_paddr) {
+ pr_debug("ELF segment address wraps around.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static int elf_read_phdr(const char *buf, size_t len,
+ struct kexec_elf_info *elf_info,
+ int idx)
+{
+ /* Override the const in proghdrs, we are the ones doing the loading. */
+ struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx];
+ const struct elfhdr *ehdr = elf_info->ehdr;
+ const char *pbuf;
+ struct elf_phdr *buf_phdr;
+
+ pbuf = buf + elf_info->ehdr->e_phoff + (idx * sizeof(*buf_phdr));
+ buf_phdr = (struct elf_phdr *) pbuf;
+
+ phdr->p_type = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type);
+ phdr->p_flags = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags);
+
+ switch (ehdr->e_ident[EI_CLASS]) {
+ case ELFCLASS64:
+ phdr->p_offset = elf64_to_cpu(ehdr, buf_phdr->p_offset);
+ phdr->p_paddr = elf64_to_cpu(ehdr, buf_phdr->p_paddr);
+ phdr->p_vaddr = elf64_to_cpu(ehdr, buf_phdr->p_vaddr);
+ phdr->p_filesz = elf64_to_cpu(ehdr, buf_phdr->p_filesz);
+ phdr->p_memsz = elf64_to_cpu(ehdr, buf_phdr->p_memsz);
+ phdr->p_align = elf64_to_cpu(ehdr, buf_phdr->p_align);
+ break;
+
+ case ELFCLASS32:
+ phdr->p_offset = elf32_to_cpu(ehdr, buf_phdr->p_offset);
+ phdr->p_paddr = elf32_to_cpu(ehdr, buf_phdr->p_paddr);
+ phdr->p_vaddr = elf32_to_cpu(ehdr, buf_phdr->p_vaddr);
+ phdr->p_filesz = elf32_to_cpu(ehdr, buf_phdr->p_filesz);
+ phdr->p_memsz = elf32_to_cpu(ehdr, buf_phdr->p_memsz);
+ phdr->p_align = elf32_to_cpu(ehdr, buf_phdr->p_align);
+ break;
+
+ default:
+ pr_debug("Unknown ELF class.\n");
+ return -EINVAL;
+ }
+
+ return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_read_phdrs - read the program headers from the buffer
+ *
+ * This function assumes that the program header table was checked for sanity.
+ * Use elf_is_ehdr_sane() if it wasn't.
+ */
+static int elf_read_phdrs(const char *buf, size_t len,
+ struct kexec_elf_info *elf_info)
+{
+ size_t phdr_size, i;
+ const struct elfhdr *ehdr = elf_info->ehdr;
+
+ /*
+ * e_phnum is at most 65535 so calculating the size of the
+ * program header cannot overflow.
+ */
+ phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
+
+ elf_info->proghdrs = kzalloc(phdr_size, GFP_KERNEL);
+ if (!elf_info->proghdrs)
+ return -ENOMEM;
+
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ int ret;
+
+ ret = elf_read_phdr(buf, len, elf_info, i);
+ if (ret) {
+ kfree(elf_info->proghdrs);
+ elf_info->proghdrs = NULL;
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * elf_read_from_buffer - read ELF file and sets up ELF header and ELF info
+ * @buf: Buffer to read ELF file from.
+ * @len: Size of @buf.
+ * @ehdr: Pointer to existing struct which will be populated.
+ * @elf_info: Pointer to existing struct which will be populated.
+ *
+ * This function allows reading ELF files with different byte order than
+ * the kernel, byte-swapping the fields as needed.
+ *
+ * Return:
+ * On success returns 0, and the caller should call
+ * kexec_free_elf_info(elf_info) to free the memory allocated for the section
+ * and program headers.
+ */
+static int elf_read_from_buffer(const char *buf, size_t len,
+ struct elfhdr *ehdr,
+ struct kexec_elf_info *elf_info)
+{
+ int ret;
+
+ ret = elf_read_ehdr(buf, len, ehdr);
+ if (ret)
+ return ret;
+
+ elf_info->buffer = buf;
+ elf_info->ehdr = ehdr;
+ if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
+ ret = elf_read_phdrs(buf, len, elf_info);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+/**
+ * kexec_free_elf_info - free memory allocated by elf_read_from_buffer
+ */
+void kexec_free_elf_info(struct kexec_elf_info *elf_info)
+{
+ kfree(elf_info->proghdrs);
+ memset(elf_info, 0, sizeof(*elf_info));
+}
+/**
+ * kexec_build_elf_info - read ELF executable and check that we can use it
+ */
+int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr,
+ struct kexec_elf_info *elf_info)
+{
+ int i;
+ int ret;
+
+ ret = elf_read_from_buffer(buf, len, ehdr, elf_info);
+ if (ret)
+ return ret;
+
+ /* Big endian vmlinux has type ET_DYN. */
+ if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) {
+ pr_err("Not an ELF executable.\n");
+ goto error;
+ } else if (!elf_info->proghdrs) {
+ pr_err("No ELF program header.\n");
+ goto error;
+ }
+
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ /*
+ * Kexec does not support loading interpreters.
+ * In addition this check keeps us from attempting
+ * to kexec ordinay executables.
+ */
+ if (elf_info->proghdrs[i].p_type == PT_INTERP) {
+ pr_err("Requires an ELF interpreter.\n");
+ goto error;
+ }
+ }
+
+ return 0;
+error:
+ kexec_free_elf_info(elf_info);
+ return -ENOEXEC;
+}
+
+
+int kexec_elf_probe(const char *buf, unsigned long len)
+{
+ struct elfhdr ehdr;
+ struct kexec_elf_info elf_info;
+ int ret;
+
+ ret = kexec_build_elf_info(buf, len, &ehdr, &elf_info);
+ if (ret)
+ return ret;
+
+ kexec_free_elf_info(&elf_info);
+
+ return elf_check_arch(&ehdr) ? 0 : -ENOEXEC;
+}
+
+/**
+ * kexec_elf_load - load ELF executable image
+ * @lowest_load_addr: On return, will be the address where the first PT_LOAD
+ * section will be loaded in memory.
+ *
+ * Return:
+ * 0 on success, negative value on failure.
+ */
+int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
+ struct kexec_elf_info *elf_info,
+ struct kexec_buf *kbuf,
+ unsigned long *lowest_load_addr)
+{
+ unsigned long lowest_addr = UINT_MAX;
+ int ret;
+ size_t i;
+
+ /* Read in the PT_LOAD segments. */
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ unsigned long load_addr;
+ size_t size;
+ const struct elf_phdr *phdr;
+
+ phdr = &elf_info->proghdrs[i];
+ if (phdr->p_type != PT_LOAD)
+ continue;
+
+ size = phdr->p_filesz;
+ if (size > phdr->p_memsz)
+ size = phdr->p_memsz;
+
+ kbuf->buffer = (void *) elf_info->buffer + phdr->p_offset;
+ kbuf->bufsz = size;
+ kbuf->memsz = phdr->p_memsz;
+ kbuf->buf_align = phdr->p_align;
+ kbuf->buf_min = phdr->p_paddr;
+ kbuf->mem = KEXEC_BUF_MEM_UNKNOWN;
+ ret = kexec_add_buffer(kbuf);
+ if (ret)
+ goto out;
+ load_addr = kbuf->mem;
+
+ if (load_addr < lowest_addr)
+ lowest_addr = load_addr;
+ }
+
+ *lowest_load_addr = lowest_addr;
+ ret = 0;
+ out:
+ return ret;
+}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 445337c107e0..53534aa258a6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -470,6 +470,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
*/
static void do_optimize_kprobes(void)
{
+ lockdep_assert_held(&text_mutex);
/*
* The optimization/unoptimization refers online_cpus via
* stop_machine() and cpu-hotplug modifies online_cpus.
@@ -487,9 +488,7 @@ static void do_optimize_kprobes(void)
list_empty(&optimizing_list))
return;
- mutex_lock(&text_mutex);
arch_optimize_kprobes(&optimizing_list);
- mutex_unlock(&text_mutex);
}
/*
@@ -500,6 +499,7 @@ static void do_unoptimize_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
+ lockdep_assert_held(&text_mutex);
/* See comment in do_optimize_kprobes() */
lockdep_assert_cpus_held();
@@ -507,7 +507,6 @@ static void do_unoptimize_kprobes(void)
if (list_empty(&unoptimizing_list))
return;
- mutex_lock(&text_mutex);
arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
/* Loop free_list for disarming */
list_for_each_entry_safe(op, tmp, &freeing_list, list) {
@@ -524,7 +523,6 @@ static void do_unoptimize_kprobes(void)
} else
list_del_init(&op->list);
}
- mutex_unlock(&text_mutex);
}
/* Reclaim all kprobes on the free_list */
@@ -556,6 +554,7 @@ static void kprobe_optimizer(struct work_struct *work)
{
mutex_lock(&kprobe_mutex);
cpus_read_lock();
+ mutex_lock(&text_mutex);
/* Lock modules while optimizing kprobes */
mutex_lock(&module_mutex);
@@ -583,6 +582,7 @@ static void kprobe_optimizer(struct work_struct *work)
do_free_cleaned_kprobes();
mutex_unlock(&module_mutex);
+ mutex_unlock(&text_mutex);
cpus_read_unlock();
mutex_unlock(&kprobe_mutex);
@@ -962,8 +962,15 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
.func = kprobe_ftrace_handler,
+ .flags = FTRACE_OPS_FL_SAVE_REGS,
+};
+
+static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
+ .func = kprobe_ftrace_handler,
.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};
+
+static int kprobe_ipmodify_enabled;
static int kprobe_ftrace_enabled;
/* Must ensure p->addr is really on ftrace */
@@ -976,58 +983,75 @@ static int prepare_kprobe(struct kprobe *p)
}
/* Caller must lock kprobe_mutex */
-static int arm_kprobe_ftrace(struct kprobe *p)
+static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
+ int *cnt)
{
int ret = 0;
- ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
- (unsigned long)p->addr, 0, 0);
+ ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0);
if (ret) {
pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n",
p->addr, ret);
return ret;
}
- if (kprobe_ftrace_enabled == 0) {
- ret = register_ftrace_function(&kprobe_ftrace_ops);
+ if (*cnt == 0) {
+ ret = register_ftrace_function(ops);
if (ret) {
pr_debug("Failed to init kprobe-ftrace (%d)\n", ret);
goto err_ftrace;
}
}
- kprobe_ftrace_enabled++;
+ (*cnt)++;
return ret;
err_ftrace:
/*
- * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a
- * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental
- * empty filter_hash which would undesirably trace all functions.
+ * At this point, sinec ops is not registered, we should be sefe from
+ * registering empty filter.
*/
- ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0);
+ ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
return ret;
}
+static int arm_kprobe_ftrace(struct kprobe *p)
+{
+ bool ipmodify = (p->post_handler != NULL);
+
+ return __arm_kprobe_ftrace(p,
+ ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
+ ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
+}
+
/* Caller must lock kprobe_mutex */
-static int disarm_kprobe_ftrace(struct kprobe *p)
+static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
+ int *cnt)
{
int ret = 0;
- if (kprobe_ftrace_enabled == 1) {
- ret = unregister_ftrace_function(&kprobe_ftrace_ops);
+ if (*cnt == 1) {
+ ret = unregister_ftrace_function(ops);
if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret))
return ret;
}
- kprobe_ftrace_enabled--;
+ (*cnt)--;
- ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
- (unsigned long)p->addr, 1, 0);
+ ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n",
p->addr, ret);
return ret;
}
+
+static int disarm_kprobe_ftrace(struct kprobe *p)
+{
+ bool ipmodify = (p->post_handler != NULL);
+
+ return __disarm_kprobe_ftrace(p,
+ ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
+ ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
+}
#else /* !CONFIG_KPROBES_ON_FTRACE */
#define prepare_kprobe(p) arch_prepare_kprobe(p)
#define arm_kprobe_ftrace(p) (-ENODEV)
@@ -1514,7 +1538,8 @@ static int check_kprobe_address_safe(struct kprobe *p,
/* Ensure it is not in reserved area nor out of text */
if (!kernel_text_address((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
- jump_label_text_reserved(p->addr, p->addr)) {
+ jump_label_text_reserved(p->addr, p->addr) ||
+ find_bug((unsigned long)p->addr)) {
ret = -EINVAL;
goto out;
}
@@ -1906,7 +1931,7 @@ int register_kretprobe(struct kretprobe *rp)
/* Pre-allocate memory for max kretprobe instances */
if (rp->maxactive <= 0) {
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
#else
rp->maxactive = num_possible_cpus();
@@ -2276,6 +2301,7 @@ static int __init init_kprobes(void)
init_test_probes();
return err;
}
+subsys_initcall(init_kprobes);
#ifdef CONFIG_DEBUG_FS
static void report_probe(struct seq_file *pi, struct kprobe *p,
@@ -2570,36 +2596,21 @@ static const struct file_operations fops_kp = {
static int __init debugfs_kprobe_init(void)
{
- struct dentry *dir, *file;
+ struct dentry *dir;
unsigned int value = 1;
dir = debugfs_create_dir("kprobes", NULL);
- if (!dir)
- return -ENOMEM;
- file = debugfs_create_file("list", 0400, dir, NULL,
- &debugfs_kprobes_operations);
- if (!file)
- goto error;
+ debugfs_create_file("list", 0400, dir, NULL,
+ &debugfs_kprobes_operations);
- file = debugfs_create_file("enabled", 0600, dir,
- &value, &fops_kp);
- if (!file)
- goto error;
+ debugfs_create_file("enabled", 0600, dir, &value, &fops_kp);
- file = debugfs_create_file("blacklist", 0400, dir, NULL,
- &debugfs_kprobe_blacklist_ops);
- if (!file)
- goto error;
+ debugfs_create_file("blacklist", 0400, dir, NULL,
+ &debugfs_kprobe_blacklist_ops);
return 0;
-
-error:
- debugfs_remove(dir);
- return -ENOMEM;
}
late_initcall(debugfs_kprobe_init);
#endif /* CONFIG_DEBUG_FS */
-
-module_init(init_kprobes);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 341f52117f88..233459c03b5a 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -448,34 +448,102 @@ static void print_lockdep_off(const char *bug_msg)
unsigned long nr_stack_trace_entries;
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+#ifdef CONFIG_PROVE_LOCKING
+/**
+ * struct lock_trace - single stack backtrace
+ * @hash_entry: Entry in a stack_trace_hash[] list.
+ * @hash: jhash() of @entries.
+ * @nr_entries: Number of entries in @entries.
+ * @entries: Actual stack backtrace.
+ */
+struct lock_trace {
+ struct hlist_node hash_entry;
+ u32 hash;
+ u32 nr_entries;
+ unsigned long entries[0] __aligned(sizeof(unsigned long));
+};
+#define LOCK_TRACE_SIZE_IN_LONGS \
+ (sizeof(struct lock_trace) / sizeof(unsigned long))
/*
- * Stack-trace: tightly packed array of stack backtrace
- * addresses. Protected by the graph_lock.
+ * Stack-trace: sequence of lock_trace structures. Protected by the graph_lock.
*/
static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+static struct hlist_head stack_trace_hash[STACK_TRACE_HASH_SIZE];
+
+static bool traces_identical(struct lock_trace *t1, struct lock_trace *t2)
+{
+ return t1->hash == t2->hash && t1->nr_entries == t2->nr_entries &&
+ memcmp(t1->entries, t2->entries,
+ t1->nr_entries * sizeof(t1->entries[0])) == 0;
+}
-static int save_trace(struct lock_trace *trace)
+static struct lock_trace *save_trace(void)
{
- unsigned long *entries = stack_trace + nr_stack_trace_entries;
+ struct lock_trace *trace, *t2;
+ struct hlist_head *hash_head;
+ u32 hash;
unsigned int max_entries;
- trace->offset = nr_stack_trace_entries;
- max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
- trace->nr_entries = stack_trace_save(entries, max_entries, 3);
- nr_stack_trace_entries += trace->nr_entries;
+ BUILD_BUG_ON_NOT_POWER_OF_2(STACK_TRACE_HASH_SIZE);
+ BUILD_BUG_ON(LOCK_TRACE_SIZE_IN_LONGS >= MAX_STACK_TRACE_ENTRIES);
- if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
+ trace = (struct lock_trace *)(stack_trace + nr_stack_trace_entries);
+ max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries -
+ LOCK_TRACE_SIZE_IN_LONGS;
+ trace->nr_entries = stack_trace_save(trace->entries, max_entries, 3);
+
+ if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES -
+ LOCK_TRACE_SIZE_IN_LONGS - 1) {
if (!debug_locks_off_graph_unlock())
- return 0;
+ return NULL;
print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
- return 0;
+ return NULL;
}
- return 1;
+ hash = jhash(trace->entries, trace->nr_entries *
+ sizeof(trace->entries[0]), 0);
+ trace->hash = hash;
+ hash_head = stack_trace_hash + (hash & (STACK_TRACE_HASH_SIZE - 1));
+ hlist_for_each_entry(t2, hash_head, hash_entry) {
+ if (traces_identical(trace, t2))
+ return t2;
+ }
+ nr_stack_trace_entries += LOCK_TRACE_SIZE_IN_LONGS + trace->nr_entries;
+ hlist_add_head(&trace->hash_entry, hash_head);
+
+ return trace;
+}
+
+/* Return the number of stack traces in the stack_trace[] array. */
+u64 lockdep_stack_trace_count(void)
+{
+ struct lock_trace *trace;
+ u64 c = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++) {
+ hlist_for_each_entry(trace, &stack_trace_hash[i], hash_entry) {
+ c++;
+ }
+ }
+
+ return c;
+}
+
+/* Return the number of stack hash chains that have at least one stack trace. */
+u64 lockdep_stack_hash_count(void)
+{
+ u64 c = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(stack_trace_hash); i++)
+ if (!hlist_empty(&stack_trace_hash[i]))
+ c++;
+
+ return c;
}
#endif
@@ -491,7 +559,7 @@ unsigned int max_lockdep_depth;
DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
#endif
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+#ifdef CONFIG_PROVE_LOCKING
/*
* Locking printouts:
*/
@@ -511,7 +579,7 @@ static const char *usage_str[] =
};
#endif
-const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
+const char *__get_key_name(const struct lockdep_subclass_key *key, char *str)
{
return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
}
@@ -620,7 +688,7 @@ static void print_lock(struct held_lock *hlock)
return;
}
- printk(KERN_CONT "%p", hlock->instance);
+ printk(KERN_CONT "%px", hlock->instance);
print_lock_name(lock);
printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
@@ -1235,7 +1303,7 @@ static struct lock_list *alloc_list_entry(void)
static int add_lock_to_list(struct lock_class *this,
struct lock_class *links_to, struct list_head *head,
unsigned long ip, int distance,
- struct lock_trace *trace)
+ const struct lock_trace *trace)
{
struct lock_list *entry;
/*
@@ -1249,7 +1317,7 @@ static int add_lock_to_list(struct lock_class *this,
entry->class = this;
entry->links_to = links_to;
entry->distance = distance;
- entry->trace = *trace;
+ entry->trace = trace;
/*
* Both allocation and removal are done under the graph lock; but
* iteration is under RCU-sched; see look_up_lock_class() and
@@ -1470,11 +1538,10 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
}
-static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
+static void print_lock_trace(const struct lock_trace *trace,
+ unsigned int spaces)
{
- unsigned long *entries = stack_trace + trace->offset;
-
- stack_trace_print(entries, trace->nr_entries, spaces);
+ stack_trace_print(trace->entries, trace->nr_entries, spaces);
}
/*
@@ -1489,7 +1556,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)
printk("\n-> #%u", depth);
print_lock_name(target->class);
printk(KERN_CONT ":\n");
- print_lock_trace(&target->trace, 6);
+ print_lock_trace(target->trace, 6);
}
static void
@@ -1592,7 +1659,8 @@ static noinline void print_circular_bug(struct lock_list *this,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return;
- if (!save_trace(&this->trace))
+ this->trace = save_trace();
+ if (!this->trace)
return;
depth = get_lock_depth(target);
@@ -1715,7 +1783,7 @@ check_path(struct lock_class *target, struct lock_list *src_entry,
*/
static noinline int
check_noncircular(struct held_lock *src, struct held_lock *target,
- struct lock_trace *trace)
+ struct lock_trace **const trace)
{
int ret;
struct lock_list *uninitialized_var(target_entry);
@@ -1729,13 +1797,13 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
ret = check_path(hlock_class(target), &src_entry, &target_entry);
if (unlikely(!ret)) {
- if (!trace->nr_entries) {
+ if (!*trace) {
/*
* If save_trace fails here, the printing might
* trigger a WARN but because of the !nr_entries it
* should not do bad things.
*/
- save_trace(trace);
+ *trace = save_trace();
}
print_circular_bug(&src_entry, target_entry, src, target);
@@ -1859,7 +1927,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
len += printk("%*s %s", depth, "", usage_str[bit]);
len += printk(KERN_CONT " at:\n");
- print_lock_trace(class->usage_traces + bit, len);
+ print_lock_trace(class->usage_traces[bit], len);
}
}
printk("%*s }\n", depth, "");
@@ -1884,7 +1952,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
do {
print_lock_class_header(entry->class, depth);
printk("%*s ... acquired at:\n", depth, "");
- print_lock_trace(&entry->trace, 2);
+ print_lock_trace(entry->trace, 2);
printk("\n");
if (depth == 0 && (entry != root)) {
@@ -1995,14 +2063,14 @@ print_bad_irq_dependency(struct task_struct *curr,
print_lock_name(backwards_entry->class);
pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
- print_lock_trace(backwards_entry->class->usage_traces + bit1, 1);
+ print_lock_trace(backwards_entry->class->usage_traces[bit1], 1);
pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
print_lock_name(forwards_entry->class);
pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
pr_warn("...");
- print_lock_trace(forwards_entry->class->usage_traces + bit2, 1);
+ print_lock_trace(forwards_entry->class->usage_traces[bit2], 1);
pr_warn("\nother info that might help us debug this:\n\n");
print_irq_lock_scenario(backwards_entry, forwards_entry,
@@ -2011,13 +2079,15 @@ print_bad_irq_dependency(struct task_struct *curr,
lockdep_print_held_locks(curr);
pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
- if (!save_trace(&prev_root->trace))
+ prev_root->trace = save_trace();
+ if (!prev_root->trace)
return;
print_shortest_lock_dependencies(backwards_entry, prev_root);
pr_warn("\nthe dependencies between the lock to be acquired");
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
- if (!save_trace(&next_root->trace))
+ next_root->trace = save_trace();
+ if (!next_root->trace)
return;
print_shortest_lock_dependencies(forwards_entry, next_root);
@@ -2369,7 +2439,8 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, struct lock_trace *trace)
+ struct held_lock *next, int distance,
+ struct lock_trace **const trace)
{
struct lock_list *entry;
int ret;
@@ -2444,8 +2515,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
return ret;
#endif
- if (!trace->nr_entries && !save_trace(trace))
- return 0;
+ if (!*trace) {
+ *trace = save_trace();
+ if (!*trace)
+ return 0;
+ }
/*
* Ok, all validations passed, add the new lock
@@ -2453,14 +2527,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
*/
ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
&hlock_class(prev)->locks_after,
- next->acquire_ip, distance, trace);
+ next->acquire_ip, distance, *trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
&hlock_class(next)->locks_before,
- next->acquire_ip, distance, trace);
+ next->acquire_ip, distance, *trace);
if (!ret)
return 0;
@@ -2476,7 +2550,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
- struct lock_trace trace = { .nr_entries = 0 };
+ struct lock_trace *trace = NULL;
int depth = curr->lockdep_depth;
struct held_lock *hlock;
@@ -2969,7 +3043,7 @@ static void check_chain_key(struct task_struct *curr)
#endif
}
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+#ifdef CONFIG_PROVE_LOCKING
static int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit);
@@ -3015,7 +3089,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
print_lock(this);
pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]);
- print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1);
+ print_lock_trace(hlock_class(this)->usage_traces[prev_bit], 1);
print_irqtrace_events(curr);
pr_warn("\nother info that might help us debug this:\n");
@@ -3096,7 +3170,8 @@ print_irq_inversion_bug(struct task_struct *curr,
lockdep_print_held_locks(curr);
pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
- if (!save_trace(&root->trace))
+ root->trace = save_trace();
+ if (!root->trace)
return;
print_shortest_lock_dependencies(other, root);
@@ -3580,7 +3655,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
hlock_class(this)->usage_mask |= new_mask;
- if (!save_trace(hlock_class(this)->usage_traces + new_bit))
+ if (!(hlock_class(this)->usage_traces[new_bit] = save_trace()))
return 0;
switch (new_bit) {
@@ -3608,7 +3683,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return ret;
}
-#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+#else /* CONFIG_PROVE_LOCKING */
static inline int
mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
@@ -3627,7 +3702,7 @@ static inline int separate_irq_context(struct task_struct *curr,
return 0;
}
-#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+#endif /* CONFIG_PROVE_LOCKING */
/*
* Initialize a lock instance's lock-class mapping info:
@@ -4321,8 +4396,7 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie
*/
static void check_flags(unsigned long flags)
{
-#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
- defined(CONFIG_TRACE_IRQFLAGS)
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP)
if (!debug_locks)
return;
@@ -5158,6 +5232,12 @@ void __init lockdep_init(void)
) / 1024
);
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+ printk(" memory used for stack traces: %zu kB\n",
+ (sizeof(stack_trace) + sizeof(stack_trace_hash)) / 1024
+ );
+#endif
+
printk(" per task-struct memory footprint: %zu bytes\n",
sizeof(((struct task_struct *)NULL)->held_locks));
}
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index cc83568d5012..18d85aebbb57 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -92,6 +92,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_LOCKDEP_ENTRIES 16384UL
#define MAX_LOCKDEP_CHAINS_BITS 15
#define MAX_STACK_TRACE_ENTRIES 262144UL
+#define STACK_TRACE_HASH_SIZE 8192
#else
#define MAX_LOCKDEP_ENTRIES 32768UL
@@ -102,6 +103,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
* addresses. Protected by the hash_lock.
*/
#define MAX_STACK_TRACE_ENTRIES 524288UL
+#define STACK_TRACE_HASH_SIZE 16384
#endif
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
@@ -116,7 +118,8 @@ extern struct lock_chain lock_chains[];
extern void get_usage_chars(struct lock_class *class,
char usage[LOCK_USAGE_CHARS]);
-extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
+extern const char *__get_key_name(const struct lockdep_subclass_key *key,
+ char *str);
struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
@@ -137,6 +140,10 @@ extern unsigned int max_bfs_queue_depth;
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
extern unsigned long lockdep_count_backward_deps(struct lock_class *);
+#ifdef CONFIG_TRACE_IRQFLAGS
+u64 lockdep_stack_trace_count(void);
+u64 lockdep_stack_hash_count(void);
+#endif
#else
static inline unsigned long
lockdep_count_forward_deps(struct lock_class *class)
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 9c49ec645d8b..dadb7b7fba37 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -200,7 +200,6 @@ static void lockdep_stats_debug_show(struct seq_file *m)
static int lockdep_stats_show(struct seq_file *m, void *v)
{
- struct lock_class *class;
unsigned long nr_unused = 0, nr_uncategorized = 0,
nr_irq_safe = 0, nr_irq_unsafe = 0,
nr_softirq_safe = 0, nr_softirq_unsafe = 0,
@@ -210,6 +209,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
sum_forward_deps = 0;
+#ifdef CONFIG_PROVE_LOCKING
+ struct lock_class *class;
+
list_for_each_entry(class, &all_lock_classes, lock_entry) {
if (class->usage_mask == 0)
@@ -241,13 +243,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
nr_hardirq_read_unsafe++;
-#ifdef CONFIG_PROVE_LOCKING
sum_forward_deps += lockdep_count_forward_deps(class);
-#endif
}
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
#endif
+
+#endif
seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
nr_lock_classes, MAX_LOCKDEP_KEYS);
seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
@@ -283,6 +285,12 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_process_chains);
seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+ seq_printf(m, " number of stack traces: %llu\n",
+ lockdep_stack_trace_count());
+ seq_printf(m, " number of stack hash chains: %llu\n",
+ lockdep_stack_hash_count());
+#endif
seq_printf(m, " combined max dependencies: %11u\n",
(nr_hardirq_chains + 1) *
(nr_softirq_chains + 1) *
@@ -397,7 +405,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
{
- struct lockdep_subclass_key *ckey;
+ const struct lockdep_subclass_key *ckey;
struct lock_class_stats *stats;
struct lock_class *class;
const char *cname;
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 0c601ae072b3..468a9b8422e3 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -16,7 +16,7 @@
* by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
* and Sven Dietrich.
*
- * Also see Documentation/locking/mutex-design.txt.
+ * Also see Documentation/locking/mutex-design.rst.
*/
#include <linux/mutex.h>
#include <linux/ww_mutex.h>
@@ -65,11 +65,37 @@ EXPORT_SYMBOL(__mutex_init);
#define MUTEX_FLAGS 0x07
+/*
+ * Internal helper function; C doesn't allow us to hide it :/
+ *
+ * DO NOT USE (outside of mutex code).
+ */
+static inline struct task_struct *__mutex_owner(struct mutex *lock)
+{
+ return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
+}
+
static inline struct task_struct *__owner_task(unsigned long owner)
{
return (struct task_struct *)(owner & ~MUTEX_FLAGS);
}
+bool mutex_is_locked(struct mutex *lock)
+{
+ return __mutex_owner(lock) != NULL;
+}
+EXPORT_SYMBOL(mutex_is_locked);
+
+__must_check enum mutex_trylock_recursive_enum
+mutex_trylock_recursive(struct mutex *lock)
+{
+ if (unlikely(__mutex_owner(lock) == current))
+ return MUTEX_TRYLOCK_RECURSIVE;
+
+ return mutex_trylock(lock);
+}
+EXPORT_SYMBOL(mutex_trylock_recursive);
+
static inline unsigned long __owner_flags(unsigned long owner)
{
return owner & MUTEX_FLAGS;
@@ -908,6 +934,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
might_sleep();
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+#endif
+
ww = container_of(lock, struct ww_mutex, base);
if (use_ww_ctx && ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
@@ -1379,8 +1409,13 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
*/
int __sched mutex_trylock(struct mutex *lock)
{
- bool locked = __mutex_trylock(lock);
+ bool locked;
+
+#ifdef CONFIG_DEBUG_MUTEXES
+ DEBUG_LOCKS_WARN_ON(lock->magic != lock);
+#endif
+ locked = __mutex_trylock(lock);
if (locked)
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 38fbf9fa7f1b..2874bf556162 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -9,7 +9,7 @@
* Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
* Copyright (C) 2006 Esben Nielsen
*
- * See Documentation/locking/rt-mutex-design.txt for details.
+ * See Documentation/locking/rt-mutex-design.rst for details.
*/
#include <linux/spinlock.h>
#include <linux/export.h>
@@ -628,8 +628,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/* [10] Grab the next task, i.e. owner of @lock */
- task = rt_mutex_owner(lock);
- get_task_struct(task);
+ task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
/*
@@ -709,8 +708,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/* [10] Grab the next task, i.e. the owner of @lock */
- task = rt_mutex_owner(lock);
- get_task_struct(task);
+ task = get_task_struct(rt_mutex_owner(lock));
raw_spin_lock(&task->pi_lock);
/* [11] requeue the pi waiters if necessary */
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 37524a47f002..eef04551eae7 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -105,8 +105,9 @@
#ifdef CONFIG_DEBUG_RWSEMS
# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
if (!debug_locks_silent && \
- WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
+ WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
#c, atomic_long_read(&(sem)->count), \
+ (unsigned long) sem->magic, \
atomic_long_read(&(sem)->owner), (long)current, \
list_empty(&(sem)->wait_list) ? "" : "not ")) \
debug_locks_off(); \
@@ -330,6 +331,9 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
+#ifdef CONFIG_DEBUG_RWSEMS
+ sem->magic = sem;
+#endif
atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
@@ -666,7 +670,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
preempt_disable();
rcu_read_lock();
owner = rwsem_owner_flags(sem, &flags);
- if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner)))
+ /*
+ * Don't check the read-owner as the entry may be stale.
+ */
+ if ((flags & nonspinnable) ||
+ (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
rcu_read_unlock();
preempt_enable();
@@ -720,11 +728,12 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
rcu_read_lock();
for (;;) {
- if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) {
- state = OWNER_NONSPINNABLE;
- break;
- }
-
+ /*
+ * When a waiting writer set the handoff flag, it may spin
+ * on the owner as well. Once that writer acquires the lock,
+ * we can spin on it. So we don't need to quit even when the
+ * handoff bit is set.
+ */
new = rwsem_owner_flags(sem, &new_flags);
if ((new != owner) || (new_flags != flags)) {
state = rwsem_owner_state(new, new_flags, nonspinnable);
@@ -970,6 +979,13 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
{
return false;
}
+
+static inline int
+rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
+{
+ return 0;
+}
+#define OWNER_NULL 1
#endif
/*
@@ -1000,6 +1016,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
adjustment = 0;
if (rwsem_optimistic_spin(sem, false)) {
+ /* rwsem_optimistic_spin() implies ACQUIRE on success */
/*
* Wake up other readers in the wait list if the front
* waiter is a reader.
@@ -1014,6 +1031,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
}
return sem;
} else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) {
+ /* rwsem_reader_phase_trylock() implies ACQUIRE on success */
return sem;
}
@@ -1032,6 +1050,8 @@ queue:
*/
if (adjustment && !(atomic_long_read(&sem->count) &
(RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+ /* Provide lock ACQUIRE */
+ smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
rwsem_set_reader_owned(sem);
lockevent_inc(rwsem_rlock_fast);
@@ -1065,15 +1085,18 @@ queue:
wake_up_q(&wake_q);
/* wait to be given the lock */
- while (true) {
+ for (;;) {
set_current_state(state);
- if (!waiter.task)
+ if (!smp_load_acquire(&waiter.task)) {
+ /* Matches rwsem_mark_wake()'s smp_store_release(). */
break;
+ }
if (signal_pending_state(state, current)) {
raw_spin_lock_irq(&sem->wait_lock);
if (waiter.task)
goto out_nolock;
raw_spin_unlock_irq(&sem->wait_lock);
+ /* Ordered by sem->wait_lock against rwsem_mark_wake(). */
break;
}
schedule();
@@ -1083,6 +1106,7 @@ queue:
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock);
return sem;
+
out_nolock:
list_del(&waiter.list);
if (list_empty(&sem->wait_list)) {
@@ -1123,8 +1147,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
/* do optimistic spinning and steal lock if possible */
if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) &&
- rwsem_optimistic_spin(sem, true))
+ rwsem_optimistic_spin(sem, true)) {
+ /* rwsem_optimistic_spin() implies ACQUIRE on success */
return sem;
+ }
/*
* Disable reader optimistic spinning for this rwsem after
@@ -1184,12 +1210,26 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
wait:
/* wait until we successfully acquire the lock */
set_current_state(state);
- while (true) {
- if (rwsem_try_write_lock(sem, wstate))
+ for (;;) {
+ if (rwsem_try_write_lock(sem, wstate)) {
+ /* rwsem_try_write_lock() implies ACQUIRE on success */
break;
+ }
raw_spin_unlock_irq(&sem->wait_lock);
+ /*
+ * After setting the handoff bit and failing to acquire
+ * the lock, attempt to spin on owner to accelerate lock
+ * transfer. If the previous owner is a on-cpu writer and it
+ * has just released the lock, OWNER_NULL will be returned.
+ * In this case, we attempt to acquire the lock again
+ * without sleeping.
+ */
+ if ((wstate == WRITER_HANDOFF) &&
+ (rwsem_spin_on_owner(sem, 0) == OWNER_NULL))
+ goto trylock_again;
+
/* Block until there are no active lockers. */
for (;;) {
if (signal_pending_state(state, current))
@@ -1224,7 +1264,7 @@ wait:
break;
}
}
-
+trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
@@ -1322,11 +1362,14 @@ static inline int __down_read_killable(struct rw_semaphore *sem)
static inline int __down_read_trylock(struct rw_semaphore *sem)
{
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+
/*
* Optimize for the case when the rwsem is not locked at all.
*/
- long tmp = RWSEM_UNLOCKED_VALUE;
-
+ tmp = RWSEM_UNLOCKED_VALUE;
do {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
tmp + RWSEM_READER_BIAS)) {
@@ -1367,8 +1410,11 @@ static inline int __down_write_killable(struct rw_semaphore *sem)
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
- long tmp = RWSEM_UNLOCKED_VALUE;
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
+ tmp = RWSEM_UNLOCKED_VALUE;
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
RWSEM_WRITER_LOCKED)) {
rwsem_set_owner(sem);
@@ -1384,7 +1430,9 @@ inline void __up_read(struct rw_semaphore *sem)
{
long tmp;
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+
rwsem_clear_reader_owned(sem);
tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
@@ -1402,12 +1450,14 @@ static inline void __up_write(struct rw_semaphore *sem)
{
long tmp;
+ DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
/*
* sem->owner may differ from current if the ownership is transferred
* to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
*/
DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
!rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+
rwsem_clear_owner(sem);
tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
if (unlikely(tmp & RWSEM_FLAG_WAITERS))
diff --git a/kernel/memremap.c b/kernel/memremap.c
deleted file mode 100644
index 6e1970719dc2..000000000000
--- a/kernel/memremap.c
+++ /dev/null
@@ -1,374 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
-#include <linux/device.h>
-#include <linux/io.h>
-#include <linux/kasan.h>
-#include <linux/memory_hotplug.h>
-#include <linux/mm.h>
-#include <linux/pfn_t.h>
-#include <linux/swap.h>
-#include <linux/swapops.h>
-#include <linux/types.h>
-#include <linux/wait_bit.h>
-#include <linux/xarray.h>
-#include <linux/hmm.h>
-
-static DEFINE_XARRAY(pgmap_array);
-#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
-#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
- unsigned long addr,
- swp_entry_t entry,
- unsigned int flags,
- pmd_t *pmdp)
-{
- struct page *page = device_private_entry_to_page(entry);
- struct hmm_devmem *devmem;
-
- devmem = container_of(page->pgmap, typeof(*devmem), pagemap);
-
- /*
- * The page_fault() callback must migrate page back to system memory
- * so that CPU can access it. This might fail for various reasons
- * (device issue, device was unsafely unplugged, ...). When such
- * error conditions happen, the callback must return VM_FAULT_SIGBUS.
- *
- * Note that because memory cgroup charges are accounted to the device
- * memory, this should never fail because of memory restrictions (but
- * allocation of regular system page might still fail because we are
- * out of memory).
- *
- * There is a more in-depth description of what that callback can and
- * cannot do, in include/linux/memremap.h
- */
- return devmem->page_fault(vma, addr, page, flags, pmdp);
-}
-#endif /* CONFIG_DEVICE_PRIVATE */
-
-static void pgmap_array_delete(struct resource *res)
-{
- xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
- NULL, GFP_KERNEL);
- synchronize_rcu();
-}
-
-static unsigned long pfn_first(struct dev_pagemap *pgmap)
-{
- const struct resource *res = &pgmap->res;
- struct vmem_altmap *altmap = &pgmap->altmap;
- unsigned long pfn;
-
- pfn = res->start >> PAGE_SHIFT;
- if (pgmap->altmap_valid)
- pfn += vmem_altmap_offset(altmap);
- return pfn;
-}
-
-static unsigned long pfn_end(struct dev_pagemap *pgmap)
-{
- const struct resource *res = &pgmap->res;
-
- return (res->start + resource_size(res)) >> PAGE_SHIFT;
-}
-
-static unsigned long pfn_next(unsigned long pfn)
-{
- if (pfn % 1024 == 0)
- cond_resched();
- return pfn + 1;
-}
-
-#define for_each_device_pfn(pfn, map) \
- for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
-
-static void devm_memremap_pages_release(void *data)
-{
- struct dev_pagemap *pgmap = data;
- struct device *dev = pgmap->dev;
- struct resource *res = &pgmap->res;
- resource_size_t align_start, align_size;
- unsigned long pfn;
- int nid;
-
- pgmap->kill(pgmap->ref);
- for_each_device_pfn(pfn, pgmap)
- put_page(pfn_to_page(pfn));
- pgmap->cleanup(pgmap->ref);
-
- /* pages are dead and unused, undo the arch mapping */
- align_start = res->start & ~(SECTION_SIZE - 1);
- align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
- - align_start;
-
- nid = page_to_nid(pfn_to_page(align_start >> PAGE_SHIFT));
-
- mem_hotplug_begin();
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- pfn = align_start >> PAGE_SHIFT;
- __remove_pages(page_zone(pfn_to_page(pfn)), pfn,
- align_size >> PAGE_SHIFT, NULL);
- } else {
- arch_remove_memory(nid, align_start, align_size,
- pgmap->altmap_valid ? &pgmap->altmap : NULL);
- kasan_remove_zero_shadow(__va(align_start), align_size);
- }
- mem_hotplug_done();
-
- untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
- pgmap_array_delete(res);
- dev_WARN_ONCE(dev, pgmap->altmap.alloc,
- "%s: failed to free all reserved pages\n", __func__);
-}
-
-/**
- * devm_memremap_pages - remap and provide memmap backing for the given resource
- * @dev: hosting device for @res
- * @pgmap: pointer to a struct dev_pagemap
- *
- * Notes:
- * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
- * by the caller before passing it to this function
- *
- * 2/ The altmap field may optionally be initialized, in which case altmap_valid
- * must be set to true
- *
- * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
- * at devm_memremap_pages_release() time, or if this routine fails.
- *
- * 4/ res is expected to be a host memory range that could feasibly be
- * treated as a "System RAM" range, i.e. not a device mmio range, but
- * this is not enforced.
- */
-void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
-{
- resource_size_t align_start, align_size, align_end;
- struct vmem_altmap *altmap = pgmap->altmap_valid ?
- &pgmap->altmap : NULL;
- struct resource *res = &pgmap->res;
- struct dev_pagemap *conflict_pgmap;
- struct mhp_restrictions restrictions = {
- /*
- * We do not want any optional features only our own memmap
- */
- .altmap = altmap,
- };
- pgprot_t pgprot = PAGE_KERNEL;
- int error, nid, is_ram;
-
- if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
- WARN(1, "Missing reference count teardown definition\n");
- return ERR_PTR(-EINVAL);
- }
-
- align_start = res->start & ~(SECTION_SIZE - 1);
- align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
- - align_start;
- align_end = align_start + align_size - 1;
-
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_start), NULL);
- if (conflict_pgmap) {
- dev_WARN(dev, "Conflicting mapping in same section\n");
- put_dev_pagemap(conflict_pgmap);
- error = -ENOMEM;
- goto err_array;
- }
-
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
- if (conflict_pgmap) {
- dev_WARN(dev, "Conflicting mapping in same section\n");
- put_dev_pagemap(conflict_pgmap);
- error = -ENOMEM;
- goto err_array;
- }
-
- is_ram = region_intersects(align_start, align_size,
- IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
-
- if (is_ram != REGION_DISJOINT) {
- WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
- is_ram == REGION_MIXED ? "mixed" : "ram", res);
- error = -ENXIO;
- goto err_array;
- }
-
- pgmap->dev = dev;
-
- error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
- PHYS_PFN(res->end), pgmap, GFP_KERNEL));
- if (error)
- goto err_array;
-
- nid = dev_to_node(dev);
- if (nid < 0)
- nid = numa_mem_id();
-
- error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0,
- align_size);
- if (error)
- goto err_pfn_remap;
-
- mem_hotplug_begin();
-
- /*
- * For device private memory we call add_pages() as we only need to
- * allocate and initialize struct page for the device memory. More-
- * over the device memory is un-accessible thus we do not want to
- * create a linear mapping for the memory like arch_add_memory()
- * would do.
- *
- * For all other device memory types, which are accessible by
- * the CPU, we do want the linear mapping and thus use
- * arch_add_memory().
- */
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- error = add_pages(nid, align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, &restrictions);
- } else {
- error = kasan_add_zero_shadow(__va(align_start), align_size);
- if (error) {
- mem_hotplug_done();
- goto err_kasan;
- }
-
- error = arch_add_memory(nid, align_start, align_size,
- &restrictions);
- }
-
- if (!error) {
- struct zone *zone;
-
- zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
- move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, altmap);
- }
-
- mem_hotplug_done();
- if (error)
- goto err_add_memory;
-
- /*
- * Initialization of the pages has been deferred until now in order
- * to allow us to do the work while not holding the hotplug lock.
- */
- memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
- align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, pgmap);
- percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
-
- error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
- pgmap);
- if (error)
- return ERR_PTR(error);
-
- return __va(res->start);
-
- err_add_memory:
- kasan_remove_zero_shadow(__va(align_start), align_size);
- err_kasan:
- untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
- err_pfn_remap:
- pgmap_array_delete(res);
- err_array:
- pgmap->kill(pgmap->ref);
- pgmap->cleanup(pgmap->ref);
-
- return ERR_PTR(error);
-}
-EXPORT_SYMBOL_GPL(devm_memremap_pages);
-
-void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
-{
- devm_release_action(dev, devm_memremap_pages_release, pgmap);
-}
-EXPORT_SYMBOL_GPL(devm_memunmap_pages);
-
-unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
-{
- /* number of pfns from base where pfn_to_page() is valid */
- return altmap->reserve + altmap->free;
-}
-
-void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
-{
- altmap->alloc -= nr_pfns;
-}
-
-/**
- * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
- * @pfn: page frame number to lookup page_map
- * @pgmap: optional known pgmap that already has a reference
- *
- * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap
- * is non-NULL but does not cover @pfn the reference to it will be released.
- */
-struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
- struct dev_pagemap *pgmap)
-{
- resource_size_t phys = PFN_PHYS(pfn);
-
- /*
- * In the cached case we're already holding a live reference.
- */
- if (pgmap) {
- if (phys >= pgmap->res.start && phys <= pgmap->res.end)
- return pgmap;
- put_dev_pagemap(pgmap);
- }
-
- /* fall back to slow path lookup */
- rcu_read_lock();
- pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
- if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
- pgmap = NULL;
- rcu_read_unlock();
-
- return pgmap;
-}
-EXPORT_SYMBOL_GPL(get_dev_pagemap);
-
-#ifdef CONFIG_DEV_PAGEMAP_OPS
-DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
-EXPORT_SYMBOL(devmap_managed_key);
-static atomic_t devmap_enable;
-
-/*
- * Toggle the static key for ->page_free() callbacks when dev_pagemap
- * pages go idle.
- */
-void dev_pagemap_get_ops(void)
-{
- if (atomic_inc_return(&devmap_enable) == 1)
- static_branch_enable(&devmap_managed_key);
-}
-EXPORT_SYMBOL_GPL(dev_pagemap_get_ops);
-
-void dev_pagemap_put_ops(void)
-{
- if (atomic_dec_and_test(&devmap_enable))
- static_branch_disable(&devmap_managed_key);
-}
-EXPORT_SYMBOL_GPL(dev_pagemap_put_ops);
-
-void __put_devmap_managed_page(struct page *page)
-{
- int count = page_ref_dec_return(page);
-
- /*
- * If refcount is 1 then page is freed and refcount is stable as nobody
- * holds a reference on the page.
- */
- if (count == 1) {
- /* Clear Active bit in case of parallel mark_page_accessed */
- __ClearPageActive(page);
- __ClearPageWaiters(page);
-
- mem_cgroup_uncharge(page);
-
- page->pgmap->page_free(page, page->pgmap->data);
- } else if (!count)
- __put_page(page);
-}
-EXPORT_SYMBOL(__put_devmap_managed_page);
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
diff --git a/kernel/module.c b/kernel/module.c
index a2cee14a83f3..32873bcce738 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -65,9 +65,9 @@
/*
* Modules' sections will be aligned on page boundaries
* to ensure complete separation of code and data, but
- * only when CONFIG_STRICT_MODULE_RWX=y
+ * only when CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
*/
-#ifdef CONFIG_STRICT_MODULE_RWX
+#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
# define debug_align(X) ALIGN(X, PAGE_SIZE)
#else
# define debug_align(X) (X)
@@ -544,12 +544,20 @@ static const char *kernel_symbol_name(const struct kernel_symbol *sym)
#endif
}
-static int cmp_name(const void *va, const void *vb)
+static const char *kernel_symbol_namespace(const struct kernel_symbol *sym)
{
- const char *a;
- const struct kernel_symbol *b;
- a = va; b = vb;
- return strcmp(a, kernel_symbol_name(b));
+#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS
+ if (!sym->namespace_offset)
+ return NULL;
+ return offset_to_ptr(&sym->namespace_offset);
+#else
+ return sym->namespace;
+#endif
+}
+
+static int cmp_name(const void *name, const void *sym)
+{
+ return strcmp(name, kernel_symbol_name(sym));
}
static bool find_exported_symbol_in_section(const struct symsearch *syms,
@@ -1379,6 +1387,41 @@ static inline int same_magic(const char *amagic, const char *bmagic,
}
#endif /* CONFIG_MODVERSIONS */
+static char *get_modinfo(const struct load_info *info, const char *tag);
+static char *get_next_modinfo(const struct load_info *info, const char *tag,
+ char *prev);
+
+static int verify_namespace_is_imported(const struct load_info *info,
+ const struct kernel_symbol *sym,
+ struct module *mod)
+{
+ const char *namespace;
+ char *imported_namespace;
+
+ namespace = kernel_symbol_namespace(sym);
+ if (namespace) {
+ imported_namespace = get_modinfo(info, "import_ns");
+ while (imported_namespace) {
+ if (strcmp(namespace, imported_namespace) == 0)
+ return 0;
+ imported_namespace = get_next_modinfo(
+ info, "import_ns", imported_namespace);
+ }
+#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ pr_warn(
+#else
+ pr_err(
+#endif
+ "%s: module uses symbol (%s) from namespace %s, but does not import it.\n",
+ mod->name, kernel_symbol_name(sym), namespace);
+#ifndef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
+ return -EINVAL;
+#endif
+ }
+ return 0;
+}
+
+
/* Resolve a symbol for this module. I.e. if we find one, record usage. */
static const struct kernel_symbol *resolve_symbol(struct module *mod,
const struct load_info *info,
@@ -1407,6 +1450,12 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
goto getname;
}
+ err = verify_namespace_is_imported(info, sym, mod);
+ if (err) {
+ sym = ERR_PTR(err);
+ goto getname;
+ }
+
err = ref_module(mod, owner);
if (err) {
sym = ERR_PTR(err);
@@ -1492,8 +1541,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
for (i = 0; i < info->hdr->e_shnum; i++)
if (!sect_empty(&info->sechdrs[i]))
nloaded++;
- size[0] = ALIGN(sizeof(*sect_attrs)
- + nloaded * sizeof(sect_attrs->attrs[0]),
+ size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
sizeof(sect_attrs->grp.attrs[0]));
size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
@@ -1697,6 +1745,8 @@ static int add_usage_links(struct module *mod)
return ret;
}
+static void module_remove_modinfo_attrs(struct module *mod, int end);
+
static int module_add_modinfo_attrs(struct module *mod)
{
struct module_attribute *attr;
@@ -1711,24 +1761,34 @@ static int module_add_modinfo_attrs(struct module *mod)
return -ENOMEM;
temp_attr = mod->modinfo_attrs;
- for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
if (!attr->test || attr->test(mod)) {
memcpy(temp_attr, attr, sizeof(*temp_attr));
sysfs_attr_init(&temp_attr->attr);
error = sysfs_create_file(&mod->mkobj.kobj,
&temp_attr->attr);
+ if (error)
+ goto error_out;
++temp_attr;
}
}
+
+ return 0;
+
+error_out:
+ if (i > 0)
+ module_remove_modinfo_attrs(mod, --i);
return error;
}
-static void module_remove_modinfo_attrs(struct module *mod)
+static void module_remove_modinfo_attrs(struct module *mod, int end)
{
struct module_attribute *attr;
int i;
for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+ if (end >= 0 && i > end)
+ break;
/* pick a field to test for end of list */
if (!attr->attr.name)
break;
@@ -1816,7 +1876,7 @@ static int mod_sysfs_setup(struct module *mod,
return 0;
out_unreg_modinfo_attrs:
- module_remove_modinfo_attrs(mod);
+ module_remove_modinfo_attrs(mod, -1);
out_unreg_param:
module_param_sysfs_remove(mod);
out_unreg_holders:
@@ -1852,7 +1912,7 @@ static void mod_sysfs_fini(struct module *mod)
{
}
-static void module_remove_modinfo_attrs(struct module *mod)
+static void module_remove_modinfo_attrs(struct module *mod, int end)
{
}
@@ -1868,14 +1928,14 @@ static void init_param_lock(struct module *mod)
static void mod_sysfs_teardown(struct module *mod)
{
del_usage_links(mod);
- module_remove_modinfo_attrs(mod);
+ module_remove_modinfo_attrs(mod, -1);
module_param_sysfs_remove(mod);
kobject_put(mod->mkobj.drivers_dir);
kobject_put(mod->holders_dir);
mod_sysfs_fini(mod);
}
-#ifdef CONFIG_STRICT_MODULE_RWX
+#ifdef CONFIG_ARCH_HAS_STRICT_MODULE_RWX
/*
* LKM RO/NX protection: protect module's text/ro-data
* from modification and any data from execution.
@@ -1898,6 +1958,7 @@ static void frob_text(const struct module_layout *layout,
layout->text_size >> PAGE_SHIFT);
}
+#ifdef CONFIG_STRICT_MODULE_RWX
static void frob_rodata(const struct module_layout *layout,
int (*set_memory)(unsigned long start, int num_pages))
{
@@ -1949,13 +2010,9 @@ void module_enable_ro(const struct module *mod, bool after_init)
set_vm_flush_reset_perms(mod->core_layout.base);
set_vm_flush_reset_perms(mod->init_layout.base);
frob_text(&mod->core_layout, set_memory_ro);
- frob_text(&mod->core_layout, set_memory_x);
frob_rodata(&mod->core_layout, set_memory_ro);
-
frob_text(&mod->init_layout, set_memory_ro);
- frob_text(&mod->init_layout, set_memory_x);
-
frob_rodata(&mod->init_layout, set_memory_ro);
if (after_init)
@@ -2014,9 +2071,19 @@ void set_all_modules_text_ro(void)
}
mutex_unlock(&module_mutex);
}
-#else
+#else /* !CONFIG_STRICT_MODULE_RWX */
static void module_enable_nx(const struct module *mod) { }
-#endif
+#endif /* CONFIG_STRICT_MODULE_RWX */
+static void module_enable_x(const struct module *mod)
+{
+ frob_text(&mod->core_layout, set_memory_x);
+ frob_text(&mod->init_layout, set_memory_x);
+}
+#else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+static void module_enable_nx(const struct module *mod) { }
+static void module_enable_x(const struct module *mod) { }
+#endif /* CONFIG_ARCH_HAS_STRICT_MODULE_RWX */
+
#ifdef CONFIG_LIVEPATCH
/*
@@ -2463,7 +2530,8 @@ static char *next_string(char *string, unsigned long *secsize)
return string;
}
-static char *get_modinfo(struct load_info *info, const char *tag)
+static char *get_next_modinfo(const struct load_info *info, const char *tag,
+ char *prev)
{
char *p;
unsigned int taglen = strlen(tag);
@@ -2474,13 +2542,25 @@ static char *get_modinfo(struct load_info *info, const char *tag)
* get_modinfo() calls made before rewrite_section_headers()
* must use sh_offset, as sh_addr isn't set!
*/
- for (p = (char *)info->hdr + infosec->sh_offset; p; p = next_string(p, &size)) {
+ char *modinfo = (char *)info->hdr + infosec->sh_offset;
+
+ if (prev) {
+ size -= prev - modinfo;
+ modinfo = next_string(prev, &size);
+ }
+
+ for (p = modinfo; p; p = next_string(p, &size)) {
if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
return p + taglen + 1;
}
return NULL;
}
+static char *get_modinfo(const struct load_info *info, const char *tag)
+{
+ return get_next_modinfo(info, tag, NULL);
+}
+
static void setup_modinfo(struct module *mod, struct load_info *info)
{
struct module_attribute *attr;
@@ -2723,6 +2803,11 @@ void * __weak module_alloc(unsigned long size)
return vmalloc_exec(size);
}
+bool __weak module_exit_section(const char *name)
+{
+ return strstarts(name, ".exit");
+}
+
#ifdef CONFIG_DEBUG_KMEMLEAK
static void kmemleak_load_module(const struct module *mod,
const struct load_info *info)
@@ -2912,7 +2997,7 @@ static int rewrite_section_headers(struct load_info *info, int flags)
#ifndef CONFIG_MODULE_UNLOAD
/* Don't load .exit sections */
- if (strstarts(info->secstrings+shdr->sh_name, ".exit"))
+ if (module_exit_section(info->secstrings+shdr->sh_name))
shdr->sh_flags &= ~(unsigned long)SHF_ALLOC;
#endif
}
@@ -3390,8 +3475,7 @@ static bool finished_loading(const char *name)
sched_annotate_sleep();
mutex_lock(&module_mutex);
mod = find_module_all(name, strlen(name), true);
- ret = !mod || mod->state == MODULE_STATE_LIVE
- || mod->state == MODULE_STATE_GOING;
+ ret = !mod || mod->state == MODULE_STATE_LIVE;
mutex_unlock(&module_mutex);
return ret;
@@ -3581,8 +3665,7 @@ again:
mutex_lock(&module_mutex);
old = find_module_all(mod->name, strlen(mod->name), true);
if (old != NULL) {
- if (old->state == MODULE_STATE_COMING
- || old->state == MODULE_STATE_UNFORMED) {
+ if (old->state != MODULE_STATE_LIVE) {
/* Wait in case it fails to load. */
mutex_unlock(&module_mutex);
err = wait_event_interruptible(module_wq,
@@ -3621,6 +3704,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
module_enable_ro(mod, false);
module_enable_nx(mod);
+ module_enable_x(mod);
/* Mark state as coming so strong_try_module_get() ignores us,
* but kallsyms etc. can see us. */
diff --git a/kernel/padata.c b/kernel/padata.c
index 2d2fddbb7a4c..c3fec1413295 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -46,18 +46,13 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
return target_cpu;
}
-static int padata_cpu_hash(struct parallel_data *pd)
+static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
{
- unsigned int seq_nr;
- int cpu_index;
-
/*
* Hash the sequence numbers to the cpus by taking
* seq_nr mod. number of cpus in use.
*/
-
- seq_nr = atomic_inc_return(&pd->seq_nr);
- cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
+ int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
return padata_index_to_cpu(pd, cpu_index);
}
@@ -94,17 +89,19 @@ static void padata_parallel_worker(struct work_struct *parallel_work)
*
* @pinst: padata instance
* @padata: object to be parallelized
- * @cb_cpu: cpu the serialization callback function will run on,
- * must be in the serial cpumask of padata(i.e. cpumask.cbcpu).
+ * @cb_cpu: pointer to the CPU that the serialization callback function should
+ * run on. If it's not in the serial cpumask of @pinst
+ * (i.e. cpumask.cbcpu), this function selects a fallback CPU and if
+ * none found, returns -EINVAL.
*
* The parallelization callback function will run with BHs off.
* Note: Every object which is parallelized by padata_do_parallel
* must be seen by padata_do_serial.
*/
int padata_do_parallel(struct padata_instance *pinst,
- struct padata_priv *padata, int cb_cpu)
+ struct padata_priv *padata, int *cb_cpu)
{
- int target_cpu, err;
+ int i, cpu, cpu_index, target_cpu, err;
struct padata_parallel_queue *queue;
struct parallel_data *pd;
@@ -116,8 +113,19 @@ int padata_do_parallel(struct padata_instance *pinst,
if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
goto out;
- if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
- goto out;
+ if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) {
+ if (!cpumask_weight(pd->cpumask.cbcpu))
+ goto out;
+
+ /* Select an alternate fallback CPU and notify the caller. */
+ cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu);
+
+ cpu = cpumask_first(pd->cpumask.cbcpu);
+ for (i = 0; i < cpu_index; i++)
+ cpu = cpumask_next(cpu, pd->cpumask.cbcpu);
+
+ *cb_cpu = cpu;
+ }
err = -EBUSY;
if ((pinst->flags & PADATA_RESET))
@@ -129,9 +137,10 @@ int padata_do_parallel(struct padata_instance *pinst,
err = 0;
atomic_inc(&pd->refcnt);
padata->pd = pd;
- padata->cb_cpu = cb_cpu;
+ padata->cb_cpu = *cb_cpu;
- target_cpu = padata_cpu_hash(pd);
+ padata->seq_nr = atomic_inc_return(&pd->seq_nr);
+ target_cpu = padata_cpu_hash(pd, padata->seq_nr);
padata->cpu = target_cpu;
queue = per_cpu_ptr(pd->pqueue, target_cpu);
@@ -139,7 +148,7 @@ int padata_do_parallel(struct padata_instance *pinst,
list_add_tail(&padata->list, &queue->parallel.list);
spin_unlock(&queue->parallel.lock);
- queue_work_on(target_cpu, pinst->wq, &queue->work);
+ queue_work(pinst->parallel_wq, &queue->work);
out:
rcu_read_unlock_bh();
@@ -149,63 +158,53 @@ out:
EXPORT_SYMBOL(padata_do_parallel);
/*
- * padata_get_next - Get the next object that needs serialization.
+ * padata_find_next - Find the next object that needs serialization.
*
* Return values are:
*
* A pointer to the control struct of the next object that needs
* serialization, if present in one of the percpu reorder queues.
*
- * -EINPROGRESS, if the next object that needs serialization will
+ * NULL, if the next object that needs serialization will
* be parallel processed by another cpu and is not yet present in
* the cpu's reorder queue.
- *
- * -ENODATA, if this cpu has to do the parallel processing for
- * the next object.
*/
-static struct padata_priv *padata_get_next(struct parallel_data *pd)
+static struct padata_priv *padata_find_next(struct parallel_data *pd,
+ bool remove_object)
{
- int cpu, num_cpus;
- unsigned int next_nr, next_index;
struct padata_parallel_queue *next_queue;
struct padata_priv *padata;
struct padata_list *reorder;
+ int cpu = pd->cpu;
- num_cpus = cpumask_weight(pd->cpumask.pcpu);
-
- /*
- * Calculate the percpu reorder queue and the sequence
- * number of the next object.
- */
- next_nr = pd->processed;
- next_index = next_nr % num_cpus;
- cpu = padata_index_to_cpu(pd, next_index);
next_queue = per_cpu_ptr(pd->pqueue, cpu);
-
reorder = &next_queue->reorder;
spin_lock(&reorder->lock);
- if (!list_empty(&reorder->list)) {
- padata = list_entry(reorder->list.next,
- struct padata_priv, list);
-
- list_del_init(&padata->list);
- atomic_dec(&pd->reorder_objects);
+ if (list_empty(&reorder->list)) {
+ spin_unlock(&reorder->lock);
+ return NULL;
+ }
- pd->processed++;
+ padata = list_entry(reorder->list.next, struct padata_priv, list);
+ /*
+ * Checks the rare case where two or more parallel jobs have hashed to
+ * the same CPU and one of the later ones finishes first.
+ */
+ if (padata->seq_nr != pd->processed) {
spin_unlock(&reorder->lock);
- goto out;
+ return NULL;
}
- spin_unlock(&reorder->lock);
- if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
- padata = ERR_PTR(-ENODATA);
- goto out;
+ if (remove_object) {
+ list_del_init(&padata->list);
+ atomic_dec(&pd->reorder_objects);
+ ++pd->processed;
+ pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false);
}
- padata = ERR_PTR(-EINPROGRESS);
-out:
+ spin_unlock(&reorder->lock);
return padata;
}
@@ -215,6 +214,7 @@ static void padata_reorder(struct parallel_data *pd)
struct padata_priv *padata;
struct padata_serial_queue *squeue;
struct padata_instance *pinst = pd->pinst;
+ struct padata_parallel_queue *next_queue;
/*
* We need to ensure that only one cpu can work on dequeueing of
@@ -230,27 +230,16 @@ static void padata_reorder(struct parallel_data *pd)
return;
while (1) {
- padata = padata_get_next(pd);
+ padata = padata_find_next(pd, true);
/*
* If the next object that needs serialization is parallel
* processed by another cpu and is still on it's way to the
* cpu's reorder queue, nothing to do for now.
*/
- if (PTR_ERR(padata) == -EINPROGRESS)
+ if (!padata)
break;
- /*
- * This cpu has to do the parallel processing of the next
- * object. It's waiting in the cpu's parallelization queue,
- * so exit immediately.
- */
- if (PTR_ERR(padata) == -ENODATA) {
- del_timer(&pd->timer);
- spin_unlock_bh(&pd->lock);
- return;
- }
-
cb_cpu = padata->cb_cpu;
squeue = per_cpu_ptr(pd->squeue, cb_cpu);
@@ -258,72 +247,37 @@ static void padata_reorder(struct parallel_data *pd)
list_add_tail(&padata->list, &squeue->serial.list);
spin_unlock(&squeue->serial.lock);
- queue_work_on(cb_cpu, pinst->wq, &squeue->work);
+ queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work);
}
spin_unlock_bh(&pd->lock);
/*
* The next object that needs serialization might have arrived to
- * the reorder queues in the meantime, we will be called again
- * from the timer function if no one else cares for it.
+ * the reorder queues in the meantime.
+ *
+ * Ensure reorder queue is read after pd->lock is dropped so we see
+ * new objects from another task in padata_do_serial. Pairs with
+ * smp_mb__after_atomic in padata_do_serial.
*/
- if (atomic_read(&pd->reorder_objects)
- && !(pinst->flags & PADATA_RESET))
- mod_timer(&pd->timer, jiffies + HZ);
- else
- del_timer(&pd->timer);
+ smp_mb();
- return;
+ next_queue = per_cpu_ptr(pd->pqueue, pd->cpu);
+ if (!list_empty(&next_queue->reorder.list) &&
+ padata_find_next(pd, false))
+ queue_work(pinst->serial_wq, &pd->reorder_work);
}
static void invoke_padata_reorder(struct work_struct *work)
{
- struct padata_parallel_queue *pqueue;
struct parallel_data *pd;
local_bh_disable();
- pqueue = container_of(work, struct padata_parallel_queue, reorder_work);
- pd = pqueue->pd;
+ pd = container_of(work, struct parallel_data, reorder_work);
padata_reorder(pd);
local_bh_enable();
}
-static void padata_reorder_timer(struct timer_list *t)
-{
- struct parallel_data *pd = from_timer(pd, t, timer);
- unsigned int weight;
- int target_cpu, cpu;
-
- cpu = get_cpu();
-
- /* We don't lock pd here to not interfere with parallel processing
- * padata_reorder() calls on other CPUs. We just need any CPU out of
- * the cpumask.pcpu set. It would be nice if it's the right one but
- * it doesn't matter if we're off to the next one by using an outdated
- * pd->processed value.
- */
- weight = cpumask_weight(pd->cpumask.pcpu);
- target_cpu = padata_index_to_cpu(pd, pd->processed % weight);
-
- /* ensure to call the reorder callback on the correct CPU */
- if (cpu != target_cpu) {
- struct padata_parallel_queue *pqueue;
- struct padata_instance *pinst;
-
- /* The timer function is serialized wrt itself -- no locking
- * needed.
- */
- pinst = pd->pinst;
- pqueue = per_cpu_ptr(pd->pqueue, target_cpu);
- queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work);
- } else {
- padata_reorder(pd);
- }
-
- put_cpu();
-}
-
static void padata_serial_worker(struct work_struct *serial_work)
{
struct padata_serial_queue *squeue;
@@ -362,40 +316,28 @@ static void padata_serial_worker(struct work_struct *serial_work)
*/
void padata_do_serial(struct padata_priv *padata)
{
- int cpu;
- struct padata_parallel_queue *pqueue;
- struct parallel_data *pd;
- int reorder_via_wq = 0;
-
- pd = padata->pd;
-
- cpu = get_cpu();
-
- /* We need to run on the same CPU padata_do_parallel(.., padata, ..)
- * was called on -- or, at least, enqueue the padata object into the
- * correct per-cpu queue.
- */
- if (cpu != padata->cpu) {
- reorder_via_wq = 1;
- cpu = padata->cpu;
- }
-
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
+ struct parallel_data *pd = padata->pd;
+ struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue,
+ padata->cpu);
+ struct padata_priv *cur;
spin_lock(&pqueue->reorder.lock);
+ /* Sort in ascending order of sequence number. */
+ list_for_each_entry_reverse(cur, &pqueue->reorder.list, list)
+ if (cur->seq_nr < padata->seq_nr)
+ break;
+ list_add(&padata->list, &cur->list);
atomic_inc(&pd->reorder_objects);
- list_add_tail(&padata->list, &pqueue->reorder.list);
spin_unlock(&pqueue->reorder.lock);
- put_cpu();
-
- /* If we're running on the wrong CPU, call padata_reorder() via a
- * kernel worker.
+ /*
+ * Ensure the addition to the reorder list is ordered correctly
+ * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb
+ * in padata_reorder.
*/
- if (reorder_via_wq)
- queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work);
- else
- padata_reorder(pd);
+ smp_mb__after_atomic();
+
+ padata_reorder(pd);
}
EXPORT_SYMBOL(padata_do_serial);
@@ -403,17 +345,36 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
const struct cpumask *pcpumask,
const struct cpumask *cbcpumask)
{
- if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
- return -ENOMEM;
+ struct workqueue_attrs *attrs;
+ int err = -ENOMEM;
+ if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
+ goto out;
cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
- if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
- free_cpumask_var(pd->cpumask.pcpu);
- return -ENOMEM;
- }
+ if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL))
+ goto free_pcpu_mask;
cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_online_mask);
+
+ attrs = alloc_workqueue_attrs();
+ if (!attrs)
+ goto free_cbcpu_mask;
+
+ /* Restrict parallel_wq workers to pd->cpumask.pcpu. */
+ cpumask_copy(attrs->cpumask, pd->cpumask.pcpu);
+ err = apply_workqueue_attrs(pd->pinst->parallel_wq, attrs);
+ free_workqueue_attrs(attrs);
+ if (err < 0)
+ goto free_cbcpu_mask;
+
return 0;
+
+free_cbcpu_mask:
+ free_cpumask_var(pd->cpumask.cbcpu);
+free_pcpu_mask:
+ free_cpumask_var(pd->cpumask.pcpu);
+out:
+ return err;
}
static void __padata_list_init(struct padata_list *pd_list)
@@ -439,26 +400,15 @@ static void padata_init_squeues(struct parallel_data *pd)
/* Initialize all percpu queues used by parallel workers */
static void padata_init_pqueues(struct parallel_data *pd)
{
- int cpu_index, cpu;
+ int cpu;
struct padata_parallel_queue *pqueue;
- cpu_index = 0;
- for_each_possible_cpu(cpu) {
+ for_each_cpu(cpu, pd->cpumask.pcpu) {
pqueue = per_cpu_ptr(pd->pqueue, cpu);
- if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) {
- pqueue->cpu_index = -1;
- continue;
- }
-
- pqueue->pd = pd;
- pqueue->cpu_index = cpu_index;
- cpu_index++;
-
__padata_list_init(&pqueue->reorder);
__padata_list_init(&pqueue->parallel);
INIT_WORK(&pqueue->work, padata_parallel_worker);
- INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder);
atomic_set(&pqueue->num_obj, 0);
}
}
@@ -481,17 +431,19 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
pd->squeue = alloc_percpu(struct padata_serial_queue);
if (!pd->squeue)
goto err_free_pqueue;
+
+ pd->pinst = pinst;
if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
goto err_free_squeue;
padata_init_pqueues(pd);
padata_init_squeues(pd);
- timer_setup(&pd->timer, padata_reorder_timer, 0);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
atomic_set(&pd->refcnt, 0);
- pd->pinst = pinst;
spin_lock_init(&pd->lock);
+ pd->cpu = cpumask_first(pd->cpumask.pcpu);
+ INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
return pd;
@@ -526,8 +478,6 @@ static void padata_flush_queues(struct parallel_data *pd)
flush_work(&pqueue->work);
}
- del_timer_sync(&pd->timer);
-
if (atomic_read(&pd->reorder_objects))
padata_reorder(pd);
@@ -871,6 +821,8 @@ static void __padata_free(struct padata_instance *pinst)
padata_free_pd(pinst->pd);
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
+ destroy_workqueue(pinst->serial_wq);
+ destroy_workqueue(pinst->parallel_wq);
kfree(pinst);
}
@@ -1004,13 +956,11 @@ static struct kobj_type padata_attr_type = {
* padata_alloc - allocate and initialize a padata instance and specify
* cpumasks for serial and parallel workers.
*
- * @wq: workqueue to use for the allocated padata instance
+ * @name: used to identify the instance
* @pcpumask: cpumask that will be used for padata parallelization
* @cbcpumask: cpumask that will be used for padata serialization
- *
- * Must be called from a cpus_read_lock() protected region
*/
-static struct padata_instance *padata_alloc(struct workqueue_struct *wq,
+static struct padata_instance *padata_alloc(const char *name,
const struct cpumask *pcpumask,
const struct cpumask *cbcpumask)
{
@@ -1021,11 +971,23 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq,
if (!pinst)
goto err;
- if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
+ pinst->parallel_wq = alloc_workqueue("%s_parallel", WQ_UNBOUND, 0,
+ name);
+ if (!pinst->parallel_wq)
goto err_free_inst;
+
+ get_online_cpus();
+
+ pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM |
+ WQ_CPU_INTENSIVE, 1, name);
+ if (!pinst->serial_wq)
+ goto err_put_cpus;
+
+ if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
+ goto err_free_serial_wq;
if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
free_cpumask_var(pinst->cpumask.pcpu);
- goto err_free_inst;
+ goto err_free_serial_wq;
}
if (!padata_validate_cpumask(pinst, pcpumask) ||
!padata_validate_cpumask(pinst, cbcpumask))
@@ -1037,8 +999,6 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq,
rcu_assign_pointer(pinst->pd, pd);
- pinst->wq = wq;
-
cpumask_copy(pinst->cpumask.pcpu, pcpumask);
cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
@@ -1051,11 +1011,19 @@ static struct padata_instance *padata_alloc(struct workqueue_struct *wq,
#ifdef CONFIG_HOTPLUG_CPU
cpuhp_state_add_instance_nocalls_cpuslocked(hp_online, &pinst->node);
#endif
+
+ put_online_cpus();
+
return pinst;
err_free_masks:
free_cpumask_var(pinst->cpumask.pcpu);
free_cpumask_var(pinst->cpumask.cbcpu);
+err_free_serial_wq:
+ destroy_workqueue(pinst->serial_wq);
+err_put_cpus:
+ put_online_cpus();
+ destroy_workqueue(pinst->parallel_wq);
err_free_inst:
kfree(pinst);
err:
@@ -1067,14 +1035,11 @@ err:
* Use the cpu_possible_mask for serial and
* parallel workers.
*
- * @wq: workqueue to use for the allocated padata instance
- *
- * Must be called from a cpus_read_lock() protected region
+ * @name: used to identify the instance
*/
-struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
+struct padata_instance *padata_alloc_possible(const char *name)
{
- lockdep_assert_cpus_held();
- return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
+ return padata_alloc(name, cpu_possible_mask, cpu_possible_mask);
}
EXPORT_SYMBOL(padata_alloc_possible);
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d9f55bf7d38..057540b6eee9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -372,7 +372,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
/**
* print_tainted - return a string to represent the kernel taint state.
*
- * For individual taint flag meanings, see Documentation/sysctl/kernel.txt
+ * For individual taint flag meanings, see Documentation/admin-guide/sysctl/kernel.rst
*
* The string is overwritten by the next call to print_tainted(),
* but is always NULL terminated.
diff --git a/kernel/pid.c b/kernel/pid.c
index 16263b526560..0a9f2e437217 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -37,14 +37,14 @@
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
-#include <linux/proc_fs.h>
+#include <linux/refcount.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
struct pid init_struct_pid = {
- .count = ATOMIC_INIT(1),
+ .count = REFCOUNT_INIT(1),
.tasks = {
{ .first = NULL },
{ .first = NULL },
@@ -108,8 +108,7 @@ void put_pid(struct pid *pid)
return;
ns = pid->numbers[pid->level].ns;
- if ((atomic_read(&pid->count) == 1) ||
- atomic_dec_and_test(&pid->count)) {
+ if (refcount_dec_and_test(&pid->count)) {
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -212,7 +211,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
get_pid_ns(ns);
- atomic_set(&pid->count, 1);
+ refcount_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 6d726cef241c..a6a79f85c81a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -291,14 +291,13 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
}
extern int pid_max;
-static int zero = 0;
static struct ctl_table pid_ns_ctl_table[] = {
{
.procname = "ns_last_pid",
.maxlen = sizeof(int),
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &pid_max,
},
{ }
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ff8592ddedee..d3667b4075c1 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -66,7 +66,7 @@ config HIBERNATION
need to run mkswap against the swap partition used for the suspend.
It also works with swap files to a limited extent (for details see
- <file:Documentation/power/swsusp-and-swap-files.txt>).
+ <file:Documentation/power/swsusp-and-swap-files.rst>).
Right now you may boot without resuming and resume later but in the
meantime you cannot use the swap partition(s)/file(s) involved in
@@ -75,7 +75,7 @@ config HIBERNATION
MOUNT any journaled filesystems mounted before the suspend or they
will get corrupted in a nasty way.
- For more information take a look at <file:Documentation/power/swsusp.txt>.
+ For more information take a look at <file:Documentation/power/swsusp.rst>.
config ARCH_SAVE_PAGE_KEYS
bool
@@ -256,7 +256,7 @@ config APM_EMULATION
notification of APM "events" (e.g. battery status change).
In order to use APM, you will need supporting software. For location
- and more information, read <file:Documentation/power/apm-acpi.txt>
+ and more information, read <file:Documentation/power/apm-acpi.rst>
and the Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index 41e83a779e19..9af5a50d3489 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -116,7 +116,7 @@ int pm_autosleep_set_state(suspend_state_t state)
int __init pm_autosleep_init(void)
{
- autosleep_ws = wakeup_source_register("autosleep");
+ autosleep_ws = wakeup_source_register(NULL, "autosleep");
if (!autosleep_ws)
return -ENOMEM;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index bdbd605c4215..e8710d179b35 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -254,7 +254,6 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(pm_test);
#endif /* CONFIG_PM_SLEEP_DEBUG */
-#ifdef CONFIG_DEBUG_FS
static char *suspend_step_name(enum suspend_stat_step step)
{
switch (step) {
@@ -275,6 +274,92 @@ static char *suspend_step_name(enum suspend_stat_step step)
}
}
+#define suspend_attr(_name) \
+static ssize_t _name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+{ \
+ return sprintf(buf, "%d\n", suspend_stats._name); \
+} \
+static struct kobj_attribute _name = __ATTR_RO(_name)
+
+suspend_attr(success);
+suspend_attr(fail);
+suspend_attr(failed_freeze);
+suspend_attr(failed_prepare);
+suspend_attr(failed_suspend);
+suspend_attr(failed_suspend_late);
+suspend_attr(failed_suspend_noirq);
+suspend_attr(failed_resume);
+suspend_attr(failed_resume_early);
+suspend_attr(failed_resume_noirq);
+
+static ssize_t last_failed_dev_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int index;
+ char *last_failed_dev = NULL;
+
+ index = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ index %= REC_FAILED_NUM;
+ last_failed_dev = suspend_stats.failed_devs[index];
+
+ return sprintf(buf, "%s\n", last_failed_dev);
+}
+static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev);
+
+static ssize_t last_failed_errno_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int index;
+ int last_failed_errno;
+
+ index = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
+ index %= REC_FAILED_NUM;
+ last_failed_errno = suspend_stats.errno[index];
+
+ return sprintf(buf, "%d\n", last_failed_errno);
+}
+static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno);
+
+static ssize_t last_failed_step_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int index;
+ enum suspend_stat_step step;
+ char *last_failed_step = NULL;
+
+ index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
+ index %= REC_FAILED_NUM;
+ step = suspend_stats.failed_steps[index];
+ last_failed_step = suspend_step_name(step);
+
+ return sprintf(buf, "%s\n", last_failed_step);
+}
+static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step);
+
+static struct attribute *suspend_attrs[] = {
+ &success.attr,
+ &fail.attr,
+ &failed_freeze.attr,
+ &failed_prepare.attr,
+ &failed_suspend.attr,
+ &failed_suspend_late.attr,
+ &failed_suspend_noirq.attr,
+ &failed_resume.attr,
+ &failed_resume_early.attr,
+ &failed_resume_noirq.attr,
+ &last_failed_dev.attr,
+ &last_failed_errno.attr,
+ &last_failed_step.attr,
+ NULL,
+};
+
+static struct attribute_group suspend_attr_group = {
+ .name = "suspend_stats",
+ .attrs = suspend_attrs,
+};
+
+#ifdef CONFIG_DEBUG_FS
static int suspend_stats_show(struct seq_file *s, void *unused)
{
int i, index, last_dev, last_errno, last_step;
@@ -495,7 +580,7 @@ static suspend_state_t decode_state(const char *buf, size_t n)
len = p ? p - buf : n;
/* Check hibernation first. */
- if (len == 4 && !strncmp(buf, "disk", len))
+ if (len == 4 && str_has_prefix(buf, "disk"))
return PM_SUSPEND_MAX;
#ifdef CONFIG_SUSPEND
@@ -794,6 +879,14 @@ static const struct attribute_group attr_group = {
.attrs = g,
};
+static const struct attribute_group *attr_groups[] = {
+ &attr_group,
+#ifdef CONFIG_PM_SLEEP
+ &suspend_attr_group,
+#endif
+ NULL,
+};
+
struct workqueue_struct *pm_wq;
EXPORT_SYMBOL_GPL(pm_wq);
@@ -815,7 +908,7 @@ static int __init pm_init(void)
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
- error = sysfs_create_group(power_kobj, &attr_group);
+ error = sysfs_create_groups(power_kobj, attr_groups);
if (error)
return error;
pm_print_times_init();
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 33e3febaba53..9568a2fe7c11 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -78,57 +78,9 @@ static struct pm_qos_object cpu_dma_pm_qos = {
.name = "cpu_dma_latency",
};
-static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
-static struct pm_qos_constraints network_lat_constraints = {
- .list = PLIST_HEAD_INIT(network_lat_constraints.list),
- .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
- .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
- .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
- .type = PM_QOS_MIN,
- .notifiers = &network_lat_notifier,
-};
-static struct pm_qos_object network_lat_pm_qos = {
- .constraints = &network_lat_constraints,
- .name = "network_latency",
-};
-
-
-static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
-static struct pm_qos_constraints network_tput_constraints = {
- .list = PLIST_HEAD_INIT(network_tput_constraints.list),
- .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
- .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
- .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
- .type = PM_QOS_MAX,
- .notifiers = &network_throughput_notifier,
-};
-static struct pm_qos_object network_throughput_pm_qos = {
- .constraints = &network_tput_constraints,
- .name = "network_throughput",
-};
-
-
-static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier);
-static struct pm_qos_constraints memory_bw_constraints = {
- .list = PLIST_HEAD_INIT(memory_bw_constraints.list),
- .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
- .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
- .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
- .type = PM_QOS_SUM,
- .notifiers = &memory_bandwidth_notifier,
-};
-static struct pm_qos_object memory_bandwidth_pm_qos = {
- .constraints = &memory_bw_constraints,
- .name = "memory_bandwidth",
-};
-
-
static struct pm_qos_object *pm_qos_array[] = {
&null_pm_qos,
&cpu_dma_pm_qos,
- &network_lat_pm_qos,
- &network_throughput_pm_qos,
- &memory_bandwidth_pm_qos,
};
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c874a7026e24..f3b7239f1892 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -121,43 +121,25 @@ static void s2idle_loop(void)
{
pm_pr_dbg("suspend-to-idle\n");
+ /*
+ * Suspend-to-idle equals:
+ * frozen processes + suspended devices + idle processors.
+ * Thus s2idle_enter() should be called right after all devices have
+ * been suspended.
+ *
+ * Wakeups during the noirq suspend of devices may be spurious, so try
+ * to avoid them upfront.
+ */
for (;;) {
- int error;
-
- dpm_noirq_begin();
-
- /*
- * Suspend-to-idle equals
- * frozen processes + suspended devices + idle processors.
- * Thus s2idle_enter() should be called right after
- * all devices have been suspended.
- *
- * Wakeups during the noirq suspend of devices may be spurious,
- * so prevent them from terminating the loop right away.
- */
- error = dpm_noirq_suspend_devices(PMSG_SUSPEND);
- if (!error)
- s2idle_enter();
- else if (error == -EBUSY && pm_wakeup_pending())
- error = 0;
-
- if (!error && s2idle_ops && s2idle_ops->wake)
+ if (s2idle_ops && s2idle_ops->wake)
s2idle_ops->wake();
- dpm_noirq_resume_devices(PMSG_RESUME);
-
- dpm_noirq_end();
-
- if (error)
- break;
-
- if (s2idle_ops && s2idle_ops->sync)
- s2idle_ops->sync();
-
if (pm_wakeup_pending())
break;
pm_wakeup_clear(false);
+
+ s2idle_enter();
}
pm_pr_dbg("resume from suspend-to-idle\n");
@@ -271,14 +253,21 @@ static int platform_suspend_prepare_late(suspend_state_t state)
static int platform_suspend_prepare_noirq(suspend_state_t state)
{
- return state != PM_SUSPEND_TO_IDLE && suspend_ops->prepare_late ?
- suspend_ops->prepare_late() : 0;
+ if (state == PM_SUSPEND_TO_IDLE)
+ return s2idle_ops && s2idle_ops->prepare_late ?
+ s2idle_ops->prepare_late() : 0;
+
+ return suspend_ops->prepare_late ? suspend_ops->prepare_late() : 0;
}
static void platform_resume_noirq(suspend_state_t state)
{
- if (state != PM_SUSPEND_TO_IDLE && suspend_ops->wake)
+ if (state == PM_SUSPEND_TO_IDLE) {
+ if (s2idle_ops && s2idle_ops->restore_early)
+ s2idle_ops->restore_early();
+ } else if (suspend_ops->wake) {
suspend_ops->wake();
+ }
}
static void platform_resume_early(suspend_state_t state)
@@ -415,11 +404,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (error)
goto Devices_early_resume;
- if (state == PM_SUSPEND_TO_IDLE && pm_test_level != TEST_PLATFORM) {
- s2idle_loop();
- goto Platform_early_resume;
- }
-
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
pr_err("noirq suspend of devices failed\n");
@@ -432,6 +416,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
+ if (state == PM_SUSPEND_TO_IDLE) {
+ s2idle_loop();
+ goto Platform_wake;
+ }
+
error = suspend_disable_secondary_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 4210152e56f0..105df4dfc783 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -27,7 +27,7 @@ static DEFINE_MUTEX(wakelocks_lock);
struct wakelock {
char *name;
struct rb_node node;
- struct wakeup_source ws;
+ struct wakeup_source *ws;
#ifdef CONFIG_PM_WAKELOCKS_GC
struct list_head lru;
#endif
@@ -46,7 +46,7 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active)
for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
wl = rb_entry(node, struct wakelock, node);
- if (wl->ws.active == show_active)
+ if (wl->ws->active == show_active)
str += scnprintf(str, end - str, "%s ", wl->name);
}
if (str > buf)
@@ -112,16 +112,16 @@ static void __wakelocks_gc(struct work_struct *work)
u64 idle_time_ns;
bool active;
- spin_lock_irq(&wl->ws.lock);
- idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time));
- active = wl->ws.active;
- spin_unlock_irq(&wl->ws.lock);
+ spin_lock_irq(&wl->ws->lock);
+ idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws->last_time));
+ active = wl->ws->active;
+ spin_unlock_irq(&wl->ws->lock);
if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC))
break;
if (!active) {
- wakeup_source_remove(&wl->ws);
+ wakeup_source_unregister(wl->ws);
rb_erase(&wl->node, &wakelocks_tree);
list_del(&wl->lru);
kfree(wl->name);
@@ -187,9 +187,15 @@ static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
kfree(wl);
return ERR_PTR(-ENOMEM);
}
- wl->ws.name = wl->name;
- wl->ws.last_time = ktime_get();
- wakeup_source_add(&wl->ws);
+
+ wl->ws = wakeup_source_register(NULL, wl->name);
+ if (!wl->ws) {
+ kfree(wl->name);
+ kfree(wl);
+ return ERR_PTR(-ENOMEM);
+ }
+ wl->ws->last_time = ktime_get();
+
rb_link_node(&wl->node, parent, node);
rb_insert_color(&wl->node, &wakelocks_tree);
wakelocks_lru_add(wl);
@@ -233,9 +239,9 @@ int pm_wake_lock(const char *buf)
u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1;
do_div(timeout_ms, NSEC_PER_MSEC);
- __pm_wakeup_event(&wl->ws, timeout_ms);
+ __pm_wakeup_event(wl->ws, timeout_ms);
} else {
- __pm_stay_awake(&wl->ws);
+ __pm_stay_awake(wl->ws);
}
wakelocks_lru_most_recent(wl);
@@ -271,7 +277,7 @@ int pm_wake_unlock(const char *buf)
ret = PTR_ERR(wl);
goto out;
}
- __pm_relax(&wl->ws);
+ __pm_relax(wl->ws);
wakelocks_lru_most_recent(wl);
wakelocks_gc();
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
index 1d21ebacfdb8..17a9591e54ff 100644
--- a/kernel/printk/braille.c
+++ b/kernel/printk/braille.c
@@ -11,11 +11,18 @@
int _braille_console_setup(char **str, char **brl_options)
{
- if (!strncmp(*str, "brl,", 4)) {
+ size_t len;
+
+ len = str_has_prefix(*str, "brl,");
+ if (len) {
*brl_options = "";
- *str += 4;
- } else if (!strncmp(*str, "brl=", 4)) {
- *brl_options = *str + 4;
+ *str += len;
+ return 0;
+ }
+
+ len = str_has_prefix(*str, "brl=");
+ if (len) {
+ *brl_options = *str + len;
*str = strchr(*brl_options, ',');
if (!*str) {
pr_err("need port name after brl=\n");
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1888f6a3b694..ca65327a6de8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -118,19 +118,29 @@ static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
static int __control_devkmsg(char *str)
{
+ size_t len;
+
if (!str)
return -EINVAL;
- if (!strncmp(str, "on", 2)) {
+ len = str_has_prefix(str, "on");
+ if (len) {
devkmsg_log = DEVKMSG_LOG_MASK_ON;
- return 2;
- } else if (!strncmp(str, "off", 3)) {
+ return len;
+ }
+
+ len = str_has_prefix(str, "off");
+ if (len) {
devkmsg_log = DEVKMSG_LOG_MASK_OFF;
- return 3;
- } else if (!strncmp(str, "ratelimit", 9)) {
+ return len;
+ }
+
+ len = str_has_prefix(str, "ratelimit");
+ if (len) {
devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;
- return 9;
+ return len;
}
+
return -EINVAL;
}
@@ -3274,7 +3284,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
/* move first record forward until length fits into the buffer */
seq = dumper->cur_seq;
idx = dumper->cur_idx;
- while (l > size && seq < dumper->next_seq) {
+ while (l >= size && seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
l -= msg_print_text(msg, true, time, NULL, 0);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 83a531cea2f3..cb9ddcc08119 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -32,6 +32,8 @@
#include <linux/compat.h>
#include <linux/sched/signal.h>
+#include <asm/syscall.h> /* for syscall_get_* */
+
/*
* Access another process' address space via ptrace.
* Source/target buffer must be kernel space,
@@ -897,7 +899,100 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
* to ensure no machine forgets it.
*/
EXPORT_SYMBOL_GPL(task_user_regset_view);
-#endif
+
+static unsigned long
+ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ unsigned long args[ARRAY_SIZE(info->entry.args)];
+ int i;
+
+ info->op = PTRACE_SYSCALL_INFO_ENTRY;
+ info->entry.nr = syscall_get_nr(child, regs);
+ syscall_get_arguments(child, regs, args);
+ for (i = 0; i < ARRAY_SIZE(args); i++)
+ info->entry.args[i] = args[i];
+
+ /* args is the last field in struct ptrace_syscall_info.entry */
+ return offsetofend(struct ptrace_syscall_info, entry.args);
+}
+
+static unsigned long
+ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ /*
+ * As struct ptrace_syscall_info.entry is currently a subset
+ * of struct ptrace_syscall_info.seccomp, it makes sense to
+ * initialize that subset using ptrace_get_syscall_info_entry().
+ * This can be reconsidered in the future if these structures
+ * diverge significantly enough.
+ */
+ ptrace_get_syscall_info_entry(child, regs, info);
+ info->op = PTRACE_SYSCALL_INFO_SECCOMP;
+ info->seccomp.ret_data = child->ptrace_message;
+
+ /* ret_data is the last field in struct ptrace_syscall_info.seccomp */
+ return offsetofend(struct ptrace_syscall_info, seccomp.ret_data);
+}
+
+static unsigned long
+ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs,
+ struct ptrace_syscall_info *info)
+{
+ info->op = PTRACE_SYSCALL_INFO_EXIT;
+ info->exit.rval = syscall_get_error(child, regs);
+ info->exit.is_error = !!info->exit.rval;
+ if (!info->exit.is_error)
+ info->exit.rval = syscall_get_return_value(child, regs);
+
+ /* is_error is the last field in struct ptrace_syscall_info.exit */
+ return offsetofend(struct ptrace_syscall_info, exit.is_error);
+}
+
+static int
+ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size,
+ void __user *datavp)
+{
+ struct pt_regs *regs = task_pt_regs(child);
+ struct ptrace_syscall_info info = {
+ .op = PTRACE_SYSCALL_INFO_NONE,
+ .arch = syscall_get_arch(child),
+ .instruction_pointer = instruction_pointer(regs),
+ .stack_pointer = user_stack_pointer(regs),
+ };
+ unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry);
+ unsigned long write_size;
+
+ /*
+ * This does not need lock_task_sighand() to access
+ * child->last_siginfo because ptrace_freeze_traced()
+ * called earlier by ptrace_check_attach() ensures that
+ * the tracee cannot go away and clear its last_siginfo.
+ */
+ switch (child->last_siginfo ? child->last_siginfo->si_code : 0) {
+ case SIGTRAP | 0x80:
+ switch (child->ptrace_message) {
+ case PTRACE_EVENTMSG_SYSCALL_ENTRY:
+ actual_size = ptrace_get_syscall_info_entry(child, regs,
+ &info);
+ break;
+ case PTRACE_EVENTMSG_SYSCALL_EXIT:
+ actual_size = ptrace_get_syscall_info_exit(child, regs,
+ &info);
+ break;
+ }
+ break;
+ case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8):
+ actual_size = ptrace_get_syscall_info_seccomp(child, regs,
+ &info);
+ break;
+ }
+
+ write_size = min(actual_size, user_size);
+ return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size;
+}
+#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
int ptrace_request(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
@@ -1114,6 +1209,10 @@ int ptrace_request(struct task_struct *child, long request,
ret = __put_user(kiov.iov_len, &uiov->iov_len);
break;
}
+
+ case PTRACE_GET_SYSCALL_INFO:
+ ret = ptrace_get_syscall_info(child, addr, datavp);
+ break;
#endif
case PTRACE_SECCOMP_GET_FILTER:
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 480edf328b51..7644eda17d62 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -7,7 +7,7 @@ menu "RCU Subsystem"
config TREE_RCU
bool
- default y if !PREEMPT && SMP
+ default y if !PREEMPTION && SMP
help
This option selects the RCU implementation that is
designed for very large SMP system with hundreds or
@@ -16,7 +16,7 @@ config TREE_RCU
config PREEMPT_RCU
bool
- default y if PREEMPT
+ default y if PREEMPTION
help
This option selects the RCU implementation that is
designed for very large SMP systems with hundreds or
@@ -28,7 +28,7 @@ config PREEMPT_RCU
config TINY_RCU
bool
- default y if !PREEMPT && !SMP
+ default y if !PREEMPTION && !SMP
help
This option selects the RCU implementation that is
designed for UP systems from which real-time response
@@ -70,7 +70,7 @@ config TREE_SRCU
This option selects the full-fledged version of SRCU.
config TASKS_RCU
- def_bool PREEMPT
+ def_bool PREEMPTION
select SRCU
help
This option enables a task-based RCU implementation that uses
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 5ec3ea4028e2..4aa02eee8f6c 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -8,6 +8,17 @@ menu "RCU Debugging"
config PROVE_RCU
def_bool PROVE_LOCKING
+config PROVE_RCU_LIST
+ bool "RCU list lockdep debugging"
+ depends on PROVE_RCU && RCU_EXPERT
+ default n
+ help
+ Enable RCU lockdep checking for list usages. By default it is
+ turned off since there are several list RCU users that still
+ need to be converted to pass a lockdep expression. To prevent
+ false-positive splats, we keep it default disabled but once all
+ users are converted, we can remove this config option.
+
config TORTURE_TEST
tristate
default n
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 5290b01de534..8fd4f82c9b3d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -227,6 +227,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
#ifdef CONFIG_RCU_STALL_COMMON
+extern int rcu_cpu_stall_ftrace_dump;
extern int rcu_cpu_stall_suppress;
extern int rcu_cpu_stall_timeout;
int rcu_jiffies_till_stall_check(void);
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 9bd5f6023c21..495c58ce1640 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -24,6 +24,49 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
}
/*
+ * Enqueue an rcu_head structure onto the specified callback list.
+ * This function assumes that the callback is non-lazy because it
+ * is intended for use by no-CBs CPUs, which do not distinguish
+ * between lazy and non-lazy RCU callbacks.
+ */
+void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
+{
+ *rclp->tail = rhp;
+ rclp->tail = &rhp->next;
+ WRITE_ONCE(rclp->len, rclp->len + 1);
+}
+
+/*
+ * Flush the second rcu_cblist structure onto the first one, obliterating
+ * any contents of the first. If rhp is non-NULL, enqueue it as the sole
+ * element of the second rcu_cblist structure, but ensuring that the second
+ * rcu_cblist structure, if initially non-empty, always appears non-empty
+ * throughout the process. If rdp is NULL, the second rcu_cblist structure
+ * is instead initialized to empty.
+ */
+void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
+ struct rcu_cblist *srclp,
+ struct rcu_head *rhp)
+{
+ drclp->head = srclp->head;
+ if (drclp->head)
+ drclp->tail = srclp->tail;
+ else
+ drclp->tail = &drclp->head;
+ drclp->len = srclp->len;
+ drclp->len_lazy = srclp->len_lazy;
+ if (!rhp) {
+ rcu_cblist_init(srclp);
+ } else {
+ rhp->next = NULL;
+ srclp->head = rhp;
+ srclp->tail = &rhp->next;
+ WRITE_ONCE(srclp->len, 1);
+ srclp->len_lazy = 0;
+ }
+}
+
+/*
* Dequeue the oldest rcu_head structure from the specified callback
* list. This function assumes that the callback is non-lazy, but
* the caller can later invoke rcu_cblist_dequeued_lazy() if it
@@ -44,6 +87,67 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp)
return rhp;
}
+/* Set the length of an rcu_segcblist structure. */
+void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ atomic_long_set(&rsclp->len, v);
+#else
+ WRITE_ONCE(rsclp->len, v);
+#endif
+}
+
+/*
+ * Increase the numeric length of an rcu_segcblist structure by the
+ * specified amount, which can be negative. This can cause the ->len
+ * field to disagree with the actual number of callbacks on the structure.
+ * This increase is fully ordered with respect to the callers accesses
+ * both before and after.
+ */
+void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ smp_mb__before_atomic(); /* Up to the caller! */
+ atomic_long_add(v, &rsclp->len);
+ smp_mb__after_atomic(); /* Up to the caller! */
+#else
+ smp_mb(); /* Up to the caller! */
+ WRITE_ONCE(rsclp->len, rsclp->len + v);
+ smp_mb(); /* Up to the caller! */
+#endif
+}
+
+/*
+ * Increase the numeric length of an rcu_segcblist structure by one.
+ * This can cause the ->len field to disagree with the actual number of
+ * callbacks on the structure. This increase is fully ordered with respect
+ * to the callers accesses both before and after.
+ */
+void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp)
+{
+ rcu_segcblist_add_len(rsclp, 1);
+}
+
+/*
+ * Exchange the numeric length of the specified rcu_segcblist structure
+ * with the specified value. This can cause the ->len field to disagree
+ * with the actual number of callbacks on the structure. This exchange is
+ * fully ordered with respect to the callers accesses both before and after.
+ */
+long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v)
+{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return atomic_long_xchg(&rsclp->len, v);
+#else
+ long ret = rsclp->len;
+
+ smp_mb(); /* Up to the caller! */
+ WRITE_ONCE(rsclp->len, v);
+ smp_mb(); /* Up to the caller! */
+ return ret;
+#endif
+}
+
/*
* Initialize an rcu_segcblist structure.
*/
@@ -56,8 +160,9 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp)
rsclp->head = NULL;
for (i = 0; i < RCU_CBLIST_NSEGS; i++)
rsclp->tails[i] = &rsclp->head;
- rsclp->len = 0;
+ rcu_segcblist_set_len(rsclp, 0);
rsclp->len_lazy = 0;
+ rsclp->enabled = 1;
}
/*
@@ -69,7 +174,16 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
WARN_ON_ONCE(!rcu_segcblist_empty(rsclp));
WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp));
WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp));
- rsclp->tails[RCU_NEXT_TAIL] = NULL;
+ rsclp->enabled = 0;
+}
+
+/*
+ * Mark the specified rcu_segcblist structure as offloaded. This
+ * structure must be empty.
+ */
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
+{
+ rsclp->offloaded = 1;
}
/*
@@ -118,6 +232,18 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
}
/*
+ * Return false if there are no CBs awaiting grace periods, otherwise,
+ * return true and store the nearest waited-upon grace period into *lp.
+ */
+bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp)
+{
+ if (!rcu_segcblist_pend_cbs(rsclp))
+ return false;
+ *lp = rsclp->gp_seq[RCU_WAIT_TAIL];
+ return true;
+}
+
+/*
* Enqueue the specified callback onto the specified rcu_segcblist
* structure, updating accounting as needed. Note that the ->len
* field may be accessed locklessly, hence the WRITE_ONCE().
@@ -129,13 +255,13 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy)
{
- WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */
+ rcu_segcblist_inc_len(rsclp);
if (lazy)
rsclp->len_lazy++;
smp_mb(); /* Ensure counts are updated before callback is enqueued. */
rhp->next = NULL;
- *rsclp->tails[RCU_NEXT_TAIL] = rhp;
- rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
+ WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp);
+ WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next);
}
/*
@@ -155,7 +281,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
if (rcu_segcblist_n_cbs(rsclp) == 0)
return false;
- WRITE_ONCE(rsclp->len, rsclp->len + 1);
+ rcu_segcblist_inc_len(rsclp);
if (lazy)
rsclp->len_lazy++;
smp_mb(); /* Ensure counts are updated before callback is entrained. */
@@ -163,9 +289,9 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--)
if (rsclp->tails[i] != rsclp->tails[i - 1])
break;
- *rsclp->tails[i] = rhp;
+ WRITE_ONCE(*rsclp->tails[i], rhp);
for (; i <= RCU_NEXT_TAIL; i++)
- rsclp->tails[i] = &rhp->next;
+ WRITE_ONCE(rsclp->tails[i], &rhp->next);
return true;
}
@@ -182,9 +308,8 @@ void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
rclp->len_lazy += rsclp->len_lazy;
- rclp->len += rsclp->len;
rsclp->len_lazy = 0;
- WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */
+ rclp->len = rcu_segcblist_xchg_len(rsclp, 0);
}
/*
@@ -200,12 +325,12 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp,
if (!rcu_segcblist_ready_cbs(rsclp))
return; /* Nothing to do. */
*rclp->tail = rsclp->head;
- rsclp->head = *rsclp->tails[RCU_DONE_TAIL];
- *rsclp->tails[RCU_DONE_TAIL] = NULL;
+ WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]);
+ WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
rclp->tail = rsclp->tails[RCU_DONE_TAIL];
for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--)
if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL])
- rsclp->tails[i] = &rsclp->head;
+ WRITE_ONCE(rsclp->tails[i], &rsclp->head);
}
/*
@@ -224,9 +349,9 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp,
return; /* Nothing to do. */
*rclp->tail = *rsclp->tails[RCU_DONE_TAIL];
rclp->tail = rsclp->tails[RCU_NEXT_TAIL];
- *rsclp->tails[RCU_DONE_TAIL] = NULL;
+ WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL);
for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++)
- rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL];
+ WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]);
}
/*
@@ -237,8 +362,7 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp,
struct rcu_cblist *rclp)
{
rsclp->len_lazy += rclp->len_lazy;
- /* ->len sampled locklessly. */
- WRITE_ONCE(rsclp->len, rsclp->len + rclp->len);
+ rcu_segcblist_add_len(rsclp, rclp->len);
rclp->len_lazy = 0;
rclp->len = 0;
}
@@ -255,10 +379,10 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp,
if (!rclp->head)
return; /* No callbacks to move. */
*rclp->tail = rsclp->head;
- rsclp->head = rclp->head;
+ WRITE_ONCE(rsclp->head, rclp->head);
for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++)
if (&rsclp->head == rsclp->tails[i])
- rsclp->tails[i] = rclp->tail;
+ WRITE_ONCE(rsclp->tails[i], rclp->tail);
else
break;
rclp->head = NULL;
@@ -274,8 +398,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
{
if (!rclp->head)
return; /* Nothing to do. */
- *rsclp->tails[RCU_NEXT_TAIL] = rclp->head;
- rsclp->tails[RCU_NEXT_TAIL] = rclp->tail;
+ WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head);
+ WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
rclp->head = NULL;
rclp->tail = &rclp->head;
}
@@ -299,7 +423,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
if (ULONG_CMP_LT(seq, rsclp->gp_seq[i]))
break;
- rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i];
+ WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]);
}
/* If no callbacks moved, nothing more need be done. */
@@ -308,7 +432,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
/* Clean up tail pointers that might have been misordered above. */
for (j = RCU_WAIT_TAIL; j < i; j++)
- rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL];
+ WRITE_ONCE(rsclp->tails[j], rsclp->tails[RCU_DONE_TAIL]);
/*
* Callbacks moved, so clean up the misordered ->tails[] pointers
@@ -319,7 +443,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq)
for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL])
break; /* No more callbacks. */
- rsclp->tails[j] = rsclp->tails[i];
+ WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]);
rsclp->gp_seq[j] = rsclp->gp_seq[i];
}
}
@@ -384,7 +508,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
* structure other than in the RCU_NEXT_TAIL segment.
*/
for (; i < RCU_NEXT_TAIL; i++) {
- rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL];
+ WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_NEXT_TAIL]);
rsclp->gp_seq[i] = seq;
}
return true;
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 71b64648464e..815c2fdd3fcc 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -9,6 +9,12 @@
#include <linux/rcu_segcblist.h>
+/* Return number of callbacks in the specified callback list. */
+static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp)
+{
+ return READ_ONCE(rclp->len);
+}
+
/*
* Account for the fact that a previously dequeued callback turned out
* to be marked as lazy.
@@ -19,6 +25,10 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
}
void rcu_cblist_init(struct rcu_cblist *rclp);
+void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp);
+void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp,
+ struct rcu_cblist *srclp,
+ struct rcu_head *rhp);
struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
/*
@@ -36,13 +46,17 @@ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
*/
static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp)
{
- return !rsclp->head;
+ return !READ_ONCE(rsclp->head);
}
/* Return number of callbacks in segmented callback list. */
static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
{
+#ifdef CONFIG_RCU_NOCB_CPU
+ return atomic_long_read(&rsclp->len);
+#else
return READ_ONCE(rsclp->len);
+#endif
}
/* Return number of lazy callbacks in segmented callback list. */
@@ -54,16 +68,22 @@ static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp)
/* Return number of lazy callbacks in segmented callback list. */
static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp)
{
- return rsclp->len - rsclp->len_lazy;
+ return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy;
}
/*
* Is the specified rcu_segcblist enabled, for example, not corresponding
- * to an offline or callback-offloaded CPU?
+ * to an offline CPU?
*/
static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
{
- return !!rsclp->tails[RCU_NEXT_TAIL];
+ return rsclp->enabled;
+}
+
+/* Is the specified rcu_segcblist offloaded? */
+static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
+{
+ return rsclp->offloaded;
}
/*
@@ -73,36 +93,18 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
*/
static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
{
- return !*rsclp->tails[seg];
-}
-
-/*
- * Interim function to return rcu_segcblist head pointer. Longer term, the
- * rcu_segcblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp)
-{
- return rsclp->head;
-}
-
-/*
- * Interim function to return rcu_segcblist head pointer. Longer term, the
- * rcu_segcblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
-{
- WARN_ON_ONCE(rcu_segcblist_empty(rsclp));
- return rsclp->tails[RCU_NEXT_TAIL];
+ return !READ_ONCE(*READ_ONCE(rsclp->tails[seg]));
}
+void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp);
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
+void rcu_segcblist_offload(struct rcu_segcblist *rsclp);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
+bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp);
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 7a6890b23c5f..5a879d073c1c 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable
static char *perf_type = "rcu";
module_param(perf_type, charp, 0444);
-MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)");
+MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)");
static int nrealreaders;
static int nrealwriters;
@@ -375,6 +375,14 @@ rcu_perf_writer(void *arg)
if (holdoff)
schedule_timeout_uninterruptible(holdoff * HZ);
+ /*
+ * Wait until rcu_end_inkernel_boot() is called for normal GP tests
+ * so that RCU is not always expedited for normal GP tests.
+ * The system_state test is approximate, but works well in practice.
+ */
+ while (!gp_exp && system_state != SYSTEM_RUNNING)
+ schedule_timeout_uninterruptible(1);
+
t = ktime_get_mono_fast_ns();
if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
t_rcu_perf_writer_started = t;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index fce4e7e6f502..3c9feca1eab1 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -161,6 +161,7 @@ static atomic_long_t n_rcu_torture_timers;
static long n_barrier_attempts;
static long n_barrier_successes; /* did rcu_barrier test succeed? */
static struct list_head rcu_torture_removed;
+static unsigned long shutdown_jiffies;
static int rcu_torture_writer_state;
#define RTWS_FIXED_DELAY 0
@@ -228,6 +229,15 @@ static u64 notrace rcu_trace_clock_local(void)
}
#endif /* #else #ifdef CONFIG_RCU_TRACE */
+/*
+ * Stop aggressive CPU-hog tests a bit before the end of the test in order
+ * to avoid interfering with test shutdown.
+ */
+static bool shutdown_time_arrived(void)
+{
+ return shutdown_secs && time_after(jiffies, shutdown_jiffies - 30 * HZ);
+}
+
static unsigned long boost_starttime; /* jiffies of next boost test start. */
static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
/* and boost task create/destroy. */
@@ -1713,12 +1723,14 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
}
// Give the scheduler a chance, even on nohz_full CPUs.
-static void rcu_torture_fwd_prog_cond_resched(void)
+static void rcu_torture_fwd_prog_cond_resched(unsigned long iter)
{
if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
- if (need_resched())
+ // Real call_rcu() floods hit userspace, so emulate that.
+ if (need_resched() || (iter & 0xfff))
schedule();
} else {
+ // No userspace emulation: CB invocation throttles call_rcu()
cond_resched();
}
}
@@ -1746,7 +1758,7 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
spin_unlock_irqrestore(&rcu_fwd_lock, flags);
kfree(rfcp);
freed++;
- rcu_torture_fwd_prog_cond_resched();
+ rcu_torture_fwd_prog_cond_resched(freed);
}
return freed;
}
@@ -1785,15 +1797,17 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
WRITE_ONCE(rcu_fwd_startat, jiffies);
stopat = rcu_fwd_startat + dur;
while (time_before(jiffies, stopat) &&
+ !shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
idx = cur_ops->readlock();
udelay(10);
cur_ops->readunlock(idx);
if (!fwd_progress_need_resched || need_resched())
- rcu_torture_fwd_prog_cond_resched();
+ rcu_torture_fwd_prog_cond_resched(1);
}
(*tested_tries)++;
if (!time_before(jiffies, stopat) &&
+ !shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
(*tested)++;
cver = READ_ONCE(rcu_torture_current_version) - cver;
@@ -1852,6 +1866,7 @@ static void rcu_torture_fwd_prog_cr(void)
gps = cur_ops->get_gp_seq();
rcu_launder_gp_seq_start = gps;
while (time_before(jiffies, stopat) &&
+ !shutdown_time_arrived() &&
!READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) {
rfcp = READ_ONCE(rcu_fwd_cb_head);
rfcpn = NULL;
@@ -1875,7 +1890,7 @@ static void rcu_torture_fwd_prog_cr(void)
rfcp->rfc_gps = 0;
}
cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
- rcu_torture_fwd_prog_cond_resched();
+ rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs);
}
stoppedat = jiffies;
n_launders_cb_snap = READ_ONCE(n_launders_cb);
@@ -1884,7 +1899,8 @@ static void rcu_torture_fwd_prog_cr(void)
cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
(void)rcu_torture_fwd_prog_cbfree();
- if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) {
+ if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) &&
+ !shutdown_time_arrived()) {
WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
__func__,
@@ -2160,6 +2176,7 @@ rcu_torture_cleanup(void)
return;
}
+ show_rcu_gp_kthreads();
rcu_torture_barrier_cleanup();
torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
torture_stop_kthread(rcu_torture_stall, stall_task);
@@ -2465,6 +2482,7 @@ rcu_torture_init(void)
goto unwind;
rcutor_hp = firsterr;
}
+ shutdown_jiffies = jiffies + shutdown_secs * HZ;
firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
if (firsterr)
goto unwind;
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index cf0e886314f2..5dffade2d7cd 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1279,8 +1279,9 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
c0 = l0 - u0;
c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld %1p)",
- cpu, c0, c1, rcu_segcblist_head(&sdp->srcu_cblist));
+ pr_cont(" %d(%ld,%ld %c)",
+ cpu, c0, c1,
+ "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
s0 += c0;
s1 += c1;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a14e5fbbea46..81105141b6a8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -56,6 +56,7 @@
#include <linux/smpboot.h>
#include <linux/jiffies.h>
#include <linux/sched/isolation.h>
+#include <linux/sched/clock.h>
#include "../time/tick-internal.h"
#include "tree.h"
@@ -210,9 +211,9 @@ static long rcu_get_n_cbs_cpu(int cpu)
{
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rcu_segcblist_is_enabled(&rdp->cblist)) /* Online normal CPU? */
+ if (rcu_segcblist_is_enabled(&rdp->cblist))
return rcu_segcblist_n_cbs(&rdp->cblist);
- return rcu_get_n_cbs_nocb_cpu(rdp); /* Works for offline, too. */
+ return 0;
}
void rcu_softirq_qs(void)
@@ -416,6 +417,12 @@ module_param(qlowmark, long, 0444);
static ulong jiffies_till_first_fqs = ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
static bool rcu_kick_kthreads;
+static int rcu_divisor = 7;
+module_param(rcu_divisor, int, 0644);
+
+/* Force an exit from rcu_do_batch() after 3 milliseconds. */
+static long rcu_resched_ns = 3 * NSEC_PER_MSEC;
+module_param(rcu_resched_ns, long, 0644);
/*
* How long the grace period must be before we start recruiting
@@ -1251,6 +1258,7 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
unsigned long gp_seq_req;
bool ret = false;
+ rcu_lockdep_assert_cblist_protected(rdp);
raw_lockdep_assert_held_rcu_node(rnp);
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
@@ -1292,7 +1300,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
unsigned long c;
bool needwake;
- lockdep_assert_irqs_disabled();
+ rcu_lockdep_assert_cblist_protected(rdp);
c = rcu_seq_snap(&rcu_state.gp_seq);
if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
/* Old request still live, so mark recent callbacks. */
@@ -1318,6 +1326,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
*/
static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
{
+ rcu_lockdep_assert_cblist_protected(rdp);
raw_lockdep_assert_held_rcu_node(rnp);
/* If no pending (not yet ready to invoke) callbacks, nothing to do. */
@@ -1335,6 +1344,21 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
}
/*
+ * Move and classify callbacks, but only if doing so won't require
+ * that the RCU grace-period kthread be awakened.
+ */
+static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
+ struct rcu_data *rdp)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
+ !raw_spin_trylock_rcu_node(rnp))
+ return;
+ WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
+ raw_spin_unlock_rcu_node(rnp);
+}
+
+/*
* Update CPU-local rcu_data state to record the beginnings and ends of
* grace periods. The caller must hold the ->lock of the leaf rcu_node
* structure corresponding to the current CPU, and must have irqs disabled.
@@ -1342,8 +1366,10 @@ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
- bool ret;
+ bool ret = false;
bool need_gp;
+ const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
raw_lockdep_assert_held_rcu_node(rnp);
@@ -1353,10 +1379,12 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Handle the ends of any preceding grace periods first. */
if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
unlikely(READ_ONCE(rdp->gpwrap))) {
- ret = rcu_advance_cbs(rnp, rdp); /* Advance callbacks. */
+ if (!offloaded)
+ ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
} else {
- ret = rcu_accelerate_cbs(rnp, rdp); /* Recent callbacks. */
+ if (!offloaded)
+ ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
}
/* Now handle the beginnings of any new-to-this-CPU grace periods. */
@@ -1657,6 +1685,7 @@ static void rcu_gp_cleanup(void)
unsigned long gp_duration;
bool needgp = false;
unsigned long new_gp_seq;
+ bool offloaded;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root();
struct swait_queue_head *sq;
@@ -1722,7 +1751,9 @@ static void rcu_gp_cleanup(void)
needgp = true;
}
/* Advance CBs to reduce false positives below. */
- if (!rcu_accelerate_cbs(rnp, rdp) && needgp) {
+ offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
+ if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
rcu_state.gp_req_activity = jiffies;
trace_rcu_grace_period(rcu_state.name,
@@ -1881,7 +1912,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
struct rcu_node *rnp_p;
raw_lockdep_assert_held_rcu_node(rnp);
- if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) ||
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
rnp->qsmask != 0) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -1916,7 +1947,9 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
- bool needwake;
+ bool needwake = false;
+ const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_node *rnp;
rnp = rdp->mynode;
@@ -1943,7 +1976,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
* This GP can't end until cpu checks in, so all of our
* callbacks can be processed during the next GP.
*/
- needwake = rcu_accelerate_cbs(rnp, rdp);
+ if (!offloaded)
+ needwake = rcu_accelerate_cbs(rnp, rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
@@ -2077,9 +2111,12 @@ int rcutree_dead_cpu(unsigned int cpu)
static void rcu_do_batch(struct rcu_data *rdp)
{
unsigned long flags;
+ const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
long bl, count;
+ long pending, tlimit = 0;
/* If no callbacks are ready, just return. */
if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
@@ -2099,13 +2136,19 @@ static void rcu_do_batch(struct rcu_data *rdp)
* callback counts, as rcu_barrier() needs to be conservative.
*/
local_irq_save(flags);
+ rcu_nocb_lock(rdp);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
- bl = rdp->blimit;
+ pending = rcu_segcblist_n_cbs(&rdp->cblist);
+ bl = max(rdp->blimit, pending >> rcu_divisor);
+ if (unlikely(bl > 100))
+ tlimit = local_clock() + rcu_resched_ns;
trace_rcu_batch_start(rcu_state.name,
rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist), bl);
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
- local_irq_restore(flags);
+ if (offloaded)
+ rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
/* Invoke callbacks. */
rhp = rcu_cblist_dequeue(&rcl);
@@ -2117,13 +2160,29 @@ static void rcu_do_batch(struct rcu_data *rdp)
* Stop only if limit reached and CPU has something to do.
* Note: The rcl structure counts down from zero.
*/
- if (-rcl.len >= bl &&
+ if (-rcl.len >= bl && !offloaded &&
(need_resched() ||
(!is_idle_task(current) && !rcu_is_callbacks_kthread())))
break;
+ if (unlikely(tlimit)) {
+ /* only call local_clock() every 32 callbacks */
+ if (likely((-rcl.len & 31) || local_clock() < tlimit))
+ continue;
+ /* Exceeded the time limit, so leave. */
+ break;
+ }
+ if (offloaded) {
+ WARN_ON_ONCE(in_serving_softirq());
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ cond_resched_tasks_rcu_qs();
+ lockdep_assert_irqs_enabled();
+ local_bh_disable();
+ }
}
local_irq_save(flags);
+ rcu_nocb_lock(rdp);
count = -rcl.len;
trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
@@ -2149,12 +2208,14 @@ static void rcu_do_batch(struct rcu_data *rdp)
* The following usually indicates a double call_rcu(). To track
* this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
*/
- WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
+ WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist));
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ count != 0 && rcu_segcblist_empty(&rdp->cblist));
- local_irq_restore(flags);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
/* Re-invoke RCU core processing if there are callbacks remaining. */
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist))
invoke_rcu_core();
}
@@ -2205,7 +2266,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
mask = 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->qsmask == 0) {
- if (!IS_ENABLED(CONFIG_PREEMPT) ||
+ if (!IS_ENABLED(CONFIG_PREEMPTION) ||
rcu_preempt_blocked_readers_cgp(rnp)) {
/*
* No point in scanning bits because they
@@ -2280,6 +2341,8 @@ static __latent_entropy void rcu_core(void)
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
+ const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ rcu_segcblist_is_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
return;
@@ -2299,7 +2362,7 @@ static __latent_entropy void rcu_core(void)
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress() &&
- rcu_segcblist_is_enabled(&rdp->cblist)) {
+ rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) {
local_irq_save(flags);
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rnp, rdp);
@@ -2309,7 +2372,7 @@ static __latent_entropy void rcu_core(void)
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
- if (rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) &&
likely(READ_ONCE(rcu_scheduler_fully_active)))
rcu_do_batch(rdp);
@@ -2489,10 +2552,11 @@ static void rcu_leak_callback(struct rcu_head *rhp)
* is expected to specify a CPU.
*/
static void
-__call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
+__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy)
{
unsigned long flags;
struct rcu_data *rdp;
+ bool was_alldone;
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
@@ -2514,28 +2578,18 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
rdp = this_cpu_ptr(&rcu_data);
/* Add the callback to our list. */
- if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
- int offline;
-
- if (cpu != -1)
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (likely(rdp->mynode)) {
- /* Post-boot, so this should be for a no-CBs CPU. */
- offline = !__call_rcu_nocb(rdp, head, lazy, flags);
- WARN_ON_ONCE(offline);
- /* Offline CPU, _call_rcu() illegal, leak callback. */
- local_irq_restore(flags);
- return;
- }
- /*
- * Very early boot, before rcu_init(). Initialize if needed
- * and then drop through to queue the callback.
- */
- WARN_ON_ONCE(cpu != -1);
+ if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) {
+ // This can trigger due to call_rcu() from offline CPU:
+ WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE);
WARN_ON_ONCE(!rcu_is_watching());
+ // Very early boot, before rcu_init(). Initialize if needed
+ // and then drop through to queue the callback.
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
}
+ if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags))
+ return; // Enqueued onto ->nocb_bypass, so just leave.
+ /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */
rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rcu_state.name, head,
@@ -2548,8 +2602,13 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
rcu_segcblist_n_cbs(&rdp->cblist));
/* Go handle any RCU core processing required. */
- __call_rcu_core(rdp, head, flags);
- local_irq_restore(flags);
+ if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
+ unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
+ __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
+ } else {
+ __call_rcu_core(rdp, head, flags);
+ local_irq_restore(flags);
+ }
}
/**
@@ -2589,7 +2648,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func, -1, 0);
+ __call_rcu(head, func, 0);
}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -2602,7 +2661,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
*/
void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
{
- __call_rcu(head, func, -1, 1);
+ __call_rcu(head, func, 1);
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);
@@ -2622,7 +2681,7 @@ static int rcu_blocking_is_gp(void)
{
int ret;
- if (IS_ENABLED(CONFIG_PREEMPT))
+ if (IS_ENABLED(CONFIG_PREEMPTION))
return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
might_sleep(); /* Check for RCU read-side critical section. */
preempt_disable();
@@ -2735,6 +2794,10 @@ static int rcu_pending(void)
/* Check for CPU stalls, if enabled. */
check_cpu_stall(rdp);
+ /* Does this CPU need a deferred NOCB wakeup? */
+ if (rcu_nocb_need_deferred_wakeup(rdp))
+ return 1;
+
/* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
if (rcu_nohz_full_cpu())
return 0;
@@ -2750,6 +2813,8 @@ static int rcu_pending(void)
/* Has RCU gone idle with this CPU needing another grace period? */
if (!rcu_gp_in_progress() &&
rcu_segcblist_is_enabled(&rdp->cblist) &&
+ (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
+ !rcu_segcblist_is_offloaded(&rdp->cblist)) &&
!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return 1;
@@ -2758,10 +2823,6 @@ static int rcu_pending(void)
unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */
return 1;
- /* Does this CPU need a deferred NOCB wakeup? */
- if (rcu_nocb_need_deferred_wakeup(rdp))
- return 1;
-
/* nothing to do */
return 0;
}
@@ -2801,6 +2862,8 @@ static void rcu_barrier_func(void *unused)
rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
atomic_inc(&rcu_state.barrier_cpu_count);
} else {
@@ -2808,6 +2871,7 @@ static void rcu_barrier_func(void *unused)
rcu_barrier_trace(TPS("IRQNQ"), -1,
rcu_state.barrier_sequence);
}
+ rcu_nocb_unlock(rdp);
}
/**
@@ -2858,22 +2922,11 @@ void rcu_barrier(void)
* corresponding CPU's preceding callbacks have been invoked.
*/
for_each_possible_cpu(cpu) {
- if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
- continue;
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rcu_is_nocb_cpu(cpu)) {
- if (!rcu_nocb_cpu_needs_barrier(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCB"), cpu,
- rcu_state.barrier_sequence);
- } else {
- rcu_barrier_trace(TPS("OnlineNoCB"), cpu,
- rcu_state.barrier_sequence);
- smp_mb__before_atomic();
- atomic_inc(&rcu_state.barrier_cpu_count);
- __call_rcu(&rdp->barrier_head,
- rcu_barrier_callback, cpu, 0);
- }
- } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
+ if (!cpu_online(cpu) &&
+ !rcu_segcblist_is_offloaded(&rdp->cblist))
+ continue;
+ if (rcu_segcblist_n_cbs(&rdp->cblist)) {
rcu_barrier_trace(TPS("OnlineQ"), cpu,
rcu_state.barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, NULL, 1);
@@ -2958,7 +3011,8 @@ rcu_boot_init_percpu_data(int cpu)
* Initializes a CPU's per-CPU RCU data. Note that only one online or
* offline event can be happening at a given time. Note also that we can
* accept some slop in the rsp->gp_seq access due to the fact that this
- * CPU cannot possibly have any RCU callbacks in flight yet.
+ * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet.
+ * And any offloaded callbacks are being numbered elsewhere.
*/
int rcutree_prepare_cpu(unsigned int cpu)
{
@@ -2972,7 +3026,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->n_force_qs_snap = rcu_state.n_force_qs;
rdp->blimit = blimit;
if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
- !init_nocb_callback_list(rdp))
+ !rcu_segcblist_is_offloaded(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */
rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */
rcu_dynticks_eqs_online();
@@ -3151,29 +3205,38 @@ void rcutree_migrate_callbacks(int cpu)
{
unsigned long flags;
struct rcu_data *my_rdp;
+ struct rcu_node *my_rnp;
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_node *rnp_root = rcu_get_root();
bool needwake;
- if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
+ if (rcu_segcblist_is_offloaded(&rdp->cblist) ||
+ rcu_segcblist_empty(&rdp->cblist))
return; /* No callbacks to migrate. */
local_irq_save(flags);
my_rdp = this_cpu_ptr(&rcu_data);
- if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
- local_irq_restore(flags);
- return;
- }
- raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ my_rnp = my_rdp->mynode;
+ rcu_nocb_lock(my_rdp); /* irqs already disabled. */
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies));
+ raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */
/* Leverage recent GPs and set GP for new callbacks. */
- needwake = rcu_advance_cbs(rnp_root, rdp) ||
- rcu_advance_cbs(rnp_root, my_rdp);
+ needwake = rcu_advance_cbs(my_rnp, rdp) ||
+ rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
+ rcu_segcblist_disable(&rdp->cblist);
WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
!rcu_segcblist_n_cbs(&my_rdp->cblist));
- raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+ if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) {
+ raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
+ __call_rcu_nocb_wake(my_rdp, true, flags);
+ } else {
+ rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */
+ raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags);
+ }
if (needwake)
rcu_gp_kthread_wake();
+ lockdep_assert_irqs_enabled();
WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
!rcu_segcblist_empty(&rdp->cblist),
"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
@@ -3234,13 +3297,13 @@ static int __init rcu_spawn_gp_kthread(void)
t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
return 0;
- rnp = rcu_get_root();
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
- rcu_state.gp_kthread = t;
if (kthread_prio) {
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
+ rnp = rcu_get_root();
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rcu_state.gp_kthread = t;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
wake_up_process(t);
rcu_spawn_nocb_kthreads();
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 7acaf3a62d39..c612f306fe89 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -194,29 +194,38 @@ struct rcu_data {
/* 5) Callback offloading. */
#ifdef CONFIG_RCU_NOCB_CPU
- struct rcu_head *nocb_head; /* CBs waiting for kthread. */
- struct rcu_head **nocb_tail;
- atomic_long_t nocb_q_count; /* # CBs waiting for nocb */
- atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
- struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
- struct rcu_head **nocb_follower_tail;
- struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
- struct task_struct *nocb_kthread;
+ struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */
+ struct task_struct *nocb_gp_kthread;
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
+ atomic_t nocb_lock_contended; /* Contention experienced. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
-
- /* The following fields are used by the leader, hence own cacheline. */
- struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
- /* CBs waiting for GP. */
- struct rcu_head **nocb_gp_tail;
- bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
- struct rcu_data *nocb_next_follower;
- /* Next follower in wakeup chain. */
-
- /* The following fields are used by the follower, hence new cachline. */
- struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
- /* Leader CPU takes GP-end wakeups. */
+ unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
+
+ /* The following fields are used by call_rcu, hence own cacheline. */
+ raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
+ struct rcu_cblist nocb_bypass; /* Lock-contention-bypass CB list. */
+ unsigned long nocb_bypass_first; /* Time (jiffies) of first enqueue. */
+ unsigned long nocb_nobypass_last; /* Last ->cblist enqueue (jiffies). */
+ int nocb_nobypass_count; /* # ->cblist enqueues at ^^^ time. */
+
+ /* The following fields are used by GP kthread, hence own cacheline. */
+ raw_spinlock_t nocb_gp_lock ____cacheline_internodealigned_in_smp;
+ struct timer_list nocb_bypass_timer; /* Force nocb_bypass flush. */
+ u8 nocb_gp_sleep; /* Is the nocb GP thread asleep? */
+ u8 nocb_gp_bypass; /* Found a bypass on last scan? */
+ u8 nocb_gp_gp; /* GP to wait for on last scan? */
+ unsigned long nocb_gp_seq; /* If so, ->gp_seq to wait for. */
+ unsigned long nocb_gp_loops; /* # passes through wait code. */
+ struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */
+ bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */
+ struct task_struct *nocb_cb_kthread;
+ struct rcu_data *nocb_next_cb_rdp;
+ /* Next rcu_data in wakeup chain. */
+
+ /* The following fields are used by CB kthread, hence new cacheline. */
+ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp;
+ /* GP rdp takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 6) RCU priority boosting. */
@@ -419,25 +428,39 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
static void rcu_preempt_deferred_qs(struct task_struct *t);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
-static bool rcu_nocb_cpu_needs_barrier(int cpu);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy, unsigned long flags);
-static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
- struct rcu_data *rdp,
- unsigned long flags);
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j);
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags);
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_cpu_nocb_kthread(int cpu);
static void __init rcu_spawn_nocb_kthreads(void);
+static void show_rcu_nocb_state(struct rcu_data *rdp);
+static void rcu_nocb_lock(struct rcu_data *rdp);
+static void rcu_nocb_unlock(struct rcu_data *rdp);
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags);
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
#ifdef CONFIG_RCU_NOCB_CPU
static void __init rcu_organize_nocb_kthreads(void);
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
-static bool init_nocb_callback_list(struct rcu_data *rdp);
-static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp);
+#define rcu_nocb_lock_irqsave(rdp, flags) \
+do { \
+ if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \
+ local_irq_save(flags); \
+ else \
+ raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \
+} while (0)
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+#define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags)
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+
static void rcu_bind_gp_kthread(void);
static bool rcu_nohz_full_cpu(void);
static void rcu_dynticks_task_enter(void);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index af7e7b9c86af..d632cd019597 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -781,7 +781,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
* other hand, if the CPU is not in an RCU read-side critical section,
* the IPI handler reports the quiescent state immediately.
*
- * Although this is a greate improvement over previous expedited
+ * Although this is a great improvement over previous expedited
* implementations, it is still unfriendly to real-time workloads, so is
* thus not recommended for any sort of common-case code. In fact, if
* you are using synchronize_rcu_expedited() in a loop, please restructure
@@ -792,6 +792,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
*/
void synchronize_rcu_expedited(void)
{
+ bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
struct rcu_exp_work rew;
struct rcu_node *rnp;
unsigned long s;
@@ -817,7 +818,7 @@ void synchronize_rcu_expedited(void)
return; /* Someone else did our work for us. */
/* Ensure that load happens before action based on it. */
- if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+ if (unlikely(boottime)) {
/* Direct call during scheduler init and early_initcalls(). */
rcu_exp_sel_wait_wake(s);
} else {
@@ -835,5 +836,8 @@ void synchronize_rcu_expedited(void)
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
+
+ if (likely(!boottime))
+ destroy_work_on_stack(&rew.rew_work);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index acb225023ed1..2defc7fe74c3 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -288,7 +288,6 @@ void rcu_note_context_switch(bool preempt)
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp;
- barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
@@ -314,15 +313,6 @@ void rcu_note_context_switch(bool preempt)
? rnp->gp_seq
: rcu_seq_snap(&rnp->gp_seq));
rcu_preempt_ctxt_queue(rnp, rdp);
- } else if (t->rcu_read_lock_nesting < 0 &&
- t->rcu_read_unlock_special.s) {
-
- /*
- * Complete exit from RCU read-side critical section on
- * behalf of preempted instance of __rcu_read_unlock().
- */
- rcu_read_unlock_special(t);
- rcu_preempt_deferred_qs(t);
} else {
rcu_preempt_deferred_qs(t);
}
@@ -340,7 +330,6 @@ void rcu_note_context_switch(bool preempt)
if (rdp->exp_deferred_qs)
rcu_report_exp_rdp(rdp);
trace_rcu_utilization(TPS("End context switch"));
- barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -626,22 +615,18 @@ static void rcu_read_unlock_special(struct task_struct *t)
(rdp->grpmask & rnp->expmask) ||
tick_nohz_full_cpu(rdp->cpu);
// Need to defer quiescent state until everything is enabled.
- if ((exp || in_irq()) && irqs_were_disabled && use_softirq &&
- (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) {
+ if (irqs_were_disabled && use_softirq &&
+ (in_interrupt() ||
+ (exp && !t->rcu_read_unlock_special.b.deferred_qs))) {
// Using softirq, safe to awaken, and we get
// no help from enabling irqs, unlike bh/preempt.
raise_softirq_irqoff(RCU_SOFTIRQ);
- } else if (exp && irqs_were_disabled && !use_softirq &&
- !t->rcu_read_unlock_special.b.deferred_qs) {
- // Safe to awaken and we get no help from enabling
- // irqs, unlike bh/preempt.
- invoke_rcu_core();
} else {
// Enabling BH or preempt does reschedule, so...
// Also if no expediting or NO_HZ_FULL, slow is OK.
set_tsk_need_resched(current);
set_preempt_need_resched();
- if (IS_ENABLED(CONFIG_IRQ_WORK) &&
+ if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
!rdp->defer_qs_iw_pending && exp) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
@@ -828,11 +813,6 @@ static void rcu_qs(void)
* dyntick-idle quiescent state visible to other CPUs, which will in
* some cases serve for expedited as well as normal grace periods.
* Either way, register a lightweight quiescent state.
- *
- * The barrier() calls are redundant in the common case when this is
- * called externally, but just in case this is called from within this
- * file.
- *
*/
void rcu_all_qs(void)
{
@@ -847,14 +827,12 @@ void rcu_all_qs(void)
return;
}
this_cpu_write(rcu_data.rcu_urgent_qs, false);
- barrier(); /* Avoid RCU read-side critical sections leaking down. */
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
local_irq_save(flags);
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
}
rcu_qs();
- barrier(); /* Avoid RCU read-side critical sections leaking up. */
preempt_enable();
}
EXPORT_SYMBOL_GPL(rcu_all_qs);
@@ -864,7 +842,6 @@ EXPORT_SYMBOL_GPL(rcu_all_qs);
*/
void rcu_note_context_switch(bool preempt)
{
- barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
rcu_qs();
/* Load rcu_urgent_qs before other flags. */
@@ -877,7 +854,6 @@ void rcu_note_context_switch(bool preempt)
rcu_tasks_qs(current);
out:
trace_rcu_utilization(TPS("End context switch"));
- barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -1134,7 +1110,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
* already exist. We only create this kthread for preemptible RCU.
* Returns zero if all is well, a negated errno otherwise.
*/
-static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
+static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
int rnp_index = rnp - rcu_get_root();
unsigned long flags;
@@ -1142,25 +1118,27 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
struct task_struct *t;
if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
- return 0;
+ return;
if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
- return 0;
+ return;
rcu_state.boost = 1;
+
if (rnp->boost_kthread_task != NULL)
- return 0;
+ return;
+
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
- if (IS_ERR(t))
- return PTR_ERR(t);
+ if (WARN_ON_ONCE(IS_ERR(t)))
+ return;
+
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
- return 0;
}
/*
@@ -1201,7 +1179,7 @@ static void __init rcu_spawn_boost_kthreads(void)
struct rcu_node *rnp;
rcu_for_each_leaf_node(rnp)
- (void)rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_one_boost_kthread(rnp);
}
static void rcu_prepare_kthreads(int cpu)
@@ -1211,7 +1189,7 @@ static void rcu_prepare_kthreads(int cpu)
/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
if (rcu_scheduler_fully_active)
- (void)rcu_spawn_one_boost_kthread(rnp);
+ rcu_spawn_one_boost_kthread(rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1248,10 +1226,10 @@ static void rcu_prepare_kthreads(int cpu)
#if !defined(CONFIG_RCU_FAST_NO_HZ)
/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so. This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
+ * Check to see if any future non-offloaded RCU-related work will need
+ * to be done by the current CPU, even if none need be done immediately,
+ * returning 1 if so. This function is part of the RCU implementation;
+ * it is -not- an exported member of the RCU API.
*
* Because we not have RCU_FAST_NO_HZ, just check whether or not this
* CPU has RCU callbacks queued.
@@ -1259,7 +1237,8 @@ static void rcu_prepare_kthreads(int cpu)
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
*nextevt = KTIME_MAX;
- return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist);
+ return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
+ !rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist);
}
/*
@@ -1360,8 +1339,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
lockdep_assert_irqs_disabled();
- /* If no callbacks, RCU doesn't need the CPU. */
- if (rcu_segcblist_empty(&rdp->cblist)) {
+ /* If no non-offloaded callbacks, RCU doesn't need the CPU. */
+ if (rcu_segcblist_empty(&rdp->cblist) ||
+ rcu_segcblist_is_offloaded(&this_cpu_ptr(&rcu_data)->cblist)) {
*nextevt = KTIME_MAX;
return 0;
}
@@ -1404,7 +1384,7 @@ static void rcu_prepare_for_idle(void)
int tne;
lockdep_assert_irqs_disabled();
- if (rcu_is_nocb_cpu(smp_processor_id()))
+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
return;
/* Handle nohz enablement switches conservatively. */
@@ -1453,8 +1433,10 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+
lockdep_assert_irqs_disabled();
- if (rcu_is_nocb_cpu(smp_processor_id()))
+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
@@ -1469,10 +1451,10 @@ static void rcu_cleanup_after_idle(void)
* specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
* created that pull the callbacks from the corresponding CPU, wait for
* a grace period to elapse, and invoke the callbacks. These kthreads
- * are organized into leaders, which manage incoming callbacks, wait for
- * grace periods, and awaken followers, and the followers, which only
- * invoke callbacks. Each leader is its own follower. The no-CBs CPUs
- * do a wake_up() on their kthread when they insert a callback into any
+ * are organized into GP kthreads, which manage incoming callbacks, wait for
+ * grace periods, and awaken CB kthreads, and the CB kthreads, which only
+ * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
+ * do a wake_up() on their GP kthread when they insert a callback into any
* empty list, unless the rcu_nocb_poll boot parameter has been specified,
* in which case each kthread actively polls its CPU. (Which isn't so great
* for energy efficiency, but which does reduce RCU's overhead on that CPU.)
@@ -1515,6 +1497,116 @@ static int __init parse_rcu_nocb_poll(char *arg)
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
/*
+ * Don't bother bypassing ->cblist if the call_rcu() rate is low.
+ * After all, the main point of bypassing is to avoid lock contention
+ * on ->nocb_lock, which only can happen at high call_rcu() rates.
+ */
+int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
+module_param(nocb_nobypass_lim_per_jiffy, int, 0);
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
+ * lock isn't immediately available, increment ->nocb_lock_contended to
+ * flag the contention.
+ */
+static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (raw_spin_trylock(&rdp->nocb_bypass_lock))
+ return;
+ atomic_inc(&rdp->nocb_lock_contended);
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ smp_mb__after_atomic(); /* atomic_inc() before lock. */
+ raw_spin_lock(&rdp->nocb_bypass_lock);
+ smp_mb__before_atomic(); /* atomic_dec() after lock. */
+ atomic_dec(&rdp->nocb_lock_contended);
+}
+
+/*
+ * Spinwait until the specified rcu_data structure's ->nocb_lock is
+ * not contended. Please note that this is extremely special-purpose,
+ * relying on the fact that at most two kthreads and one CPU contend for
+ * this lock, and also that the two kthreads are guaranteed to have frequent
+ * grace-period-duration time intervals between successive acquisitions
+ * of the lock. This allows us to use an extremely simple throttling
+ * mechanism, and further to apply it only to the CPU doing floods of
+ * call_rcu() invocations. Don't try this at home!
+ */
+static void rcu_nocb_wait_contended(struct rcu_data *rdp)
+{
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
+ cpu_relax();
+}
+
+/*
+ * Conditionally acquire the specified rcu_data structure's
+ * ->nocb_bypass_lock.
+ */
+static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ return raw_spin_trylock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_bypass_lock.
+ */
+static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist))
+ return;
+ raw_spin_lock(&rdp->nocb_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+ if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_lock);
+ }
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock and restore
+ * interrupts, but only if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ if (rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (rcu_segcblist_is_offloaded(&rdp->cblist) &&
+ cpu_online(rdp->cpu))
+ lockdep_assert_held(&rdp->nocb_lock);
+}
+
+/*
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period.
*/
@@ -1543,440 +1635,514 @@ bool rcu_is_nocb_cpu(int cpu)
}
/*
- * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock
+ * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock
* and this function releases it.
*/
-static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
- unsigned long flags)
+static void wake_nocb_gp(struct rcu_data *rdp, bool force,
+ unsigned long flags)
__releases(rdp->nocb_lock)
{
- struct rcu_data *rdp_leader = rdp->nocb_leader;
+ bool needwake = false;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
lockdep_assert_held(&rdp->nocb_lock);
- if (!READ_ONCE(rdp_leader->nocb_kthread)) {
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("AlreadyAwake"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
return;
}
- if (rdp_leader->nocb_leader_sleep || force) {
- /* Prior smp_mb__after_atomic() orders against prior enqueue. */
- WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
- del_timer(&rdp->nocb_timer);
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
- swake_up_one(&rdp_leader->nocb_wq);
- } else {
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ del_timer(&rdp->nocb_timer);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
+ needwake = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
}
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ if (needwake)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
}
/*
- * Kick the leader kthread for this NOCB group, but caller has not
- * acquired locks.
+ * Arrange to wake the GP kthread for this NOCB group at some future
+ * time when it is safe to do so.
*/
-static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
{
- unsigned long flags;
+ if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
+ mod_timer(&rdp->nocb_timer, jiffies + 1);
+ if (rdp->nocb_defer_wakeup < waketype)
+ WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
+{
+ struct rcu_cblist rcl;
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- __wake_nocb_leader(rdp, force, flags);
+ WARN_ON_ONCE(!rcu_segcblist_is_offloaded(&rdp->cblist));
+ rcu_lockdep_assert_cblist_protected(rdp);
+ lockdep_assert_held(&rdp->nocb_bypass_lock);
+ if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+ return false;
+ }
+ /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
+ if (rhp)
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ rcu_nocb_bypass_unlock(rdp);
+ return true;
}
/*
- * Arrange to wake the leader kthread for this NOCB group at some
- * future time when it is safe to do so.
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
*/
-static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
- const char *reason)
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
{
- unsigned long flags;
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist))
+ return true;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ return rcu_nocb_do_flush_bypass(rdp, rhp, j);
+}
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
- mod_timer(&rdp->nocb_timer, jiffies + 1);
- WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+/*
+ * If the ->nocb_bypass_lock is immediately available, flush the
+ * ->nocb_bypass queue into ->cblist.
+ */
+static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist) ||
+ !rcu_nocb_bypass_trylock(rdp))
+ return;
+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
}
-/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */
-static bool rcu_nocb_cpu_needs_barrier(int cpu)
+/*
+ * See whether it is appropriate to use the ->nocb_bypass list in order
+ * to control contention on ->nocb_lock. A limited number of direct
+ * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
+ * is non-empty, further callbacks must be placed into ->nocb_bypass,
+ * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
+ * back to direct use of ->cblist. However, ->nocb_bypass should not be
+ * used if ->cblist is empty, because otherwise callbacks can be stranded
+ * on ->nocb_bypass because we cannot count on the current CPU ever again
+ * invoking call_rcu(). The general rule is that if ->nocb_bypass is
+ * non-empty, the corresponding no-CBs grace-period kthread must not be
+ * in an indefinite sleep state.
+ *
+ * Finally, it is not permitted to use the bypass during early boot,
+ * as doing so would confuse the auto-initialization code. Besides
+ * which, there is no point in worrying about lock contention while
+ * there is only one CPU in operation.
+ */
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags)
{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- unsigned long ret;
-#ifdef CONFIG_PROVE_RCU
- struct rcu_head *rhp;
-#endif /* #ifdef CONFIG_PROVE_RCU */
+ unsigned long c;
+ unsigned long cur_gp_seq;
+ unsigned long j = jiffies;
+ long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- /*
- * Check count of all no-CBs callbacks awaiting invocation.
- * There needs to be a barrier before this function is called,
- * but associated with a prior determination that no more
- * callbacks would be posted. In the worst case, the first
- * barrier in rcu_barrier() suffices (but the caller cannot
- * necessarily rely on this, not a substitute for the caller
- * getting the concurrency design right!). There must also be a
- * barrier between the following load and posting of a callback
- * (if a callback is in fact needed). This is associated with an
- * atomic_inc() in the caller.
- */
- ret = rcu_get_n_cbs_nocb_cpu(rdp);
-
-#ifdef CONFIG_PROVE_RCU
- rhp = READ_ONCE(rdp->nocb_head);
- if (!rhp)
- rhp = READ_ONCE(rdp->nocb_gp_head);
- if (!rhp)
- rhp = READ_ONCE(rdp->nocb_follower_head);
-
- /* Having no rcuo kthread but CBs after scheduler starts is bad! */
- if (!READ_ONCE(rdp->nocb_kthread) && rhp &&
- rcu_scheduler_fully_active) {
- /* RCU callback enqueued before CPU first came online??? */
- pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
- cpu, rhp->func);
- WARN_ON_ONCE(1);
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false; /* Not offloaded, no bypassing. */
+ }
+ lockdep_assert_irqs_disabled();
+
+ // Don't use ->nocb_bypass during early boot.
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // If we have advanced to a new jiffy, reset counts to allow
+ // moving back from ->nocb_bypass to ->cblist.
+ if (j == rdp->nocb_nobypass_last) {
+ c = rdp->nocb_nobypass_count + 1;
+ } else {
+ WRITE_ONCE(rdp->nocb_nobypass_last, j);
+ c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
+ if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
+ nocb_nobypass_lim_per_jiffy))
+ c = 0;
+ else if (c > nocb_nobypass_lim_per_jiffy)
+ c = nocb_nobypass_lim_per_jiffy;
+ }
+ WRITE_ONCE(rdp->nocb_nobypass_count, c);
+
+ // If there hasn't yet been all that many ->cblist enqueues
+ // this jiffy, tell the caller to enqueue onto ->cblist. But flush
+ // ->nocb_bypass first.
+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+
+ // If ->nocb_bypass has been used too long or is too full,
+ // flush ->nocb_bypass to ->cblist.
+ if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+ ncbs >= qhimark) {
+ rcu_nocb_lock(rdp);
+ if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ return true; // Callback already enqueued.
}
-#endif /* #ifdef CONFIG_PROVE_RCU */
- return !!ret;
+ // We need to use the bypass.
+ rcu_nocb_wait_contended(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+ if (!ncbs) {
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
+ }
+ rcu_nocb_bypass_unlock(rdp);
+ smp_mb(); /* Order enqueue before wake. */
+ if (ncbs) {
+ local_irq_restore(flags);
+ } else {
+ // No-CBs GP kthread might be indefinitely asleep, if so, wake.
+ rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQwake"));
+ __call_rcu_nocb_wake(rdp, true, flags);
+ } else {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQnoWake"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+ }
+ return true; // Callback already enqueued.
}
/*
- * Enqueue the specified string of rcu_head structures onto the specified
- * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
- * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
- * counts are supplied by rhcount and rhcount_lazy.
+ * Awaken the no-CBs grace-period kthead if needed, either due to it
+ * legitimately being asleep or due to overload conditions.
*
* If warranted, also wake up the kthread servicing this CPUs queues.
*/
-static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
- struct rcu_head *rhp,
- struct rcu_head **rhtp,
- int rhcount, int rhcount_lazy,
- unsigned long flags)
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
{
- int len;
- struct rcu_head **old_rhpp;
+ unsigned long cur_gp_seq;
+ unsigned long j;
+ long len;
struct task_struct *t;
- /* Enqueue the callback on the nocb list and update counts. */
- atomic_long_add(rhcount, &rdp->nocb_q_count);
- /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
- old_rhpp = xchg(&rdp->nocb_tail, rhtp);
- WRITE_ONCE(*old_rhpp, rhp);
- atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
- smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
-
- /* If we are not being polled and there is a kthread, awaken it ... */
- t = READ_ONCE(rdp->nocb_kthread);
+ // If we are being polled or there is no kthread, just leave.
+ t = READ_ONCE(rdp->nocb_gp_kthread);
if (rcu_nocb_poll || !t) {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeNotPoll"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
return;
}
- len = rcu_get_n_cbs_nocb_cpu(rdp);
- if (old_rhpp == &rdp->nocb_head) {
+ // Need to actually to a wakeup.
+ len = rcu_segcblist_n_cbs(&rdp->cblist);
+ if (was_alldone) {
+ rdp->qlen_last_fqs_check = len;
if (!irqs_disabled_flags(flags)) {
/* ... if queue was empty ... */
- wake_nocb_leader(rdp, false);
+ wake_nocb_gp(rdp, false, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
- TPS("WakeEmptyIsDeferred"));
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
}
- rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
/* ... or if many callbacks queued. */
- if (!irqs_disabled_flags(flags)) {
- wake_nocb_leader(rdp, true);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeOvf"));
- } else {
- wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE,
- TPS("WakeOvfIsDeferred"));
+ rdp->qlen_last_fqs_check = len;
+ j = jiffies;
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
}
- rdp->qlen_last_fqs_check = LONG_MAX / 2;
+ smp_mb(); /* Enqueue before timer_pending(). */
+ if ((rdp->nocb_cb_sleep ||
+ !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
+ !timer_pending(&rdp->nocb_bypass_timer))
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
+ TPS("WakeOvfIsDeferred"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
} else {
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
}
return;
}
-/*
- * This is a helper for __call_rcu(), which invokes this when the normal
- * callback queue is inoperable. If this is not a no-CBs CPU, this
- * function returns failure back to __call_rcu(), which can complain
- * appropriately.
- *
- * Otherwise, this function queues the callback where the corresponding
- * "rcuo" kthread can find it.
- */
-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy, unsigned long flags)
+/* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
+static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
{
+ unsigned long flags;
+ struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
- if (!rcu_is_nocb_cpu(rdp->cpu))
- return false;
- __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
- if (__is_kfree_rcu_offset((unsigned long)rhp->func))
- trace_rcu_kfree_callback(rcu_state.name, rhp,
- (unsigned long)rhp->func,
- -atomic_long_read(&rdp->nocb_q_count_lazy),
- -rcu_get_n_cbs_nocb_cpu(rdp));
- else
- trace_rcu_callback(rcu_state.name, rhp,
- -atomic_long_read(&rdp->nocb_q_count_lazy),
- -rcu_get_n_cbs_nocb_cpu(rdp));
-
- /*
- * If called from an extended quiescent state with interrupts
- * disabled, invoke the RCU core in order to allow the idle-entry
- * deferred-wakeup check to function.
- */
- if (irqs_disabled_flags(flags) &&
- !rcu_is_watching() &&
- cpu_online(smp_processor_id()))
- invoke_rcu_core();
-
- return true;
-}
-
-/*
- * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
- * not a no-CBs CPU.
- */
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
- struct rcu_data *rdp,
- unsigned long flags)
-{
- lockdep_assert_irqs_disabled();
- if (!rcu_is_nocb_cpu(smp_processor_id()))
- return false; /* Not NOCBs CPU, caller must migrate CBs. */
- __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
- rcu_segcblist_tail(&rdp->cblist),
- rcu_segcblist_n_cbs(&rdp->cblist),
- rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
- rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_disable(&rdp->cblist);
- return true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+ __call_rcu_nocb_wake(rdp, true, flags);
}
/*
- * If necessary, kick off a new grace period, and either way wait
- * for a subsequent grace period to complete.
+ * No-CBs GP kthreads come here to wait for additional callbacks to show up
+ * or for grace periods to end.
*/
-static void rcu_nocb_wait_gp(struct rcu_data *rdp)
+static void nocb_gp_wait(struct rcu_data *my_rdp)
{
- unsigned long c;
- bool d;
+ bool bypass = false;
+ long bypass_ncbs;
+ int __maybe_unused cpu = my_rdp->cpu;
+ unsigned long cur_gp_seq;
unsigned long flags;
+ bool gotcbs;
+ unsigned long j = jiffies;
+ bool needwait_gp = false; // This prevents actual uninitialized use.
bool needwake;
- struct rcu_node *rnp = rdp->mynode;
+ bool needwake_gp;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
- local_irq_save(flags);
- c = rcu_seq_snap(&rcu_state.gp_seq);
- if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
- local_irq_restore(flags);
- } else {
- raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- needwake = rcu_start_this_gp(rnp, rdp, c);
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- if (needwake)
+ /*
+ * Each pass through the following loop checks for CBs and for the
+ * nearest grace period (if any) to wait for next. The CB kthreads
+ * and the global grace-period kthread are awakened if needed.
+ */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ if (bypass_ncbs &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
+ bypass_ncbs > 2 * qhimark)) {
+ // Bypass full or old, so flush it.
+ (void)rcu_nocb_try_flush_bypass(rdp, j);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ continue; /* No callbacks here, try next. */
+ }
+ if (bypass_ncbs) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("Bypass"));
+ bypass = true;
+ }
+ rnp = rdp->mynode;
+ if (bypass) { // Avoid race with first bypass CB.
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup,
+ RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ }
+ // Advance callbacks if helpful and low contention.
+ needwake_gp = false;
+ if (!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL) ||
+ (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake_gp = rcu_advance_cbs(rnp, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
+ }
+ // Need to wait on some grace period?
+ WARN_ON_ONCE(!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL));
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+ if (!needwait_gp ||
+ ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
+ wait_gp_seq = cur_gp_seq;
+ needwait_gp = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("NeedWaitGP"));
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ needwake = rdp->nocb_cb_sleep;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ smp_mb(); /* CB invocation -after- GP end. */
+ } else {
+ needwake = false;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake) {
+ swake_up_one(&rdp->nocb_cb_wq);
+ gotcbs = true;
+ }
+ if (needwake_gp)
rcu_gp_kthread_wake();
}
- /*
- * Wait for the grace period. Do so interruptibly to avoid messing
- * up the load average.
- */
- trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
- for (;;) {
+ my_rdp->nocb_gp_bypass = bypass;
+ my_rdp->nocb_gp_gp = needwait_gp;
+ my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+ if (bypass && !rcu_nocb_poll) {
+ // At least one child with non-empty ->nocb_bypass, so set
+ // timer in order to avoid stranding its callbacks.
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ if (rcu_nocb_poll) {
+ /* Polling, so trace if first poll in the series. */
+ if (gotcbs)
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
+ schedule_timeout_interruptible(1);
+ } else if (!needwait_gp) {
+ /* Wait for callbacks to appear. */
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+ } else {
+ rnp = my_rdp->mynode;
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
swait_event_interruptible_exclusive(
- rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1],
- (d = rcu_seq_done(&rnp->gp_seq, c)));
- if (likely(d))
- break;
- WARN_ON(signal_pending(current));
- trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait"));
+ rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
+ rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
}
- trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait"));
- smp_mb(); /* Ensure that CB invocation happens after GP end. */
+ if (!rcu_nocb_poll) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (bypass)
+ del_timer(&my_rdp->nocb_bypass_timer);
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ my_rdp->nocb_gp_seq = -1;
+ WARN_ON(signal_pending(current));
}
/*
- * Leaders come here to wait for additional callbacks to show up.
- * This function does not return until callbacks appear.
+ * No-CBs grace-period-wait kthread. There is one of these per group
+ * of CPUs, but only once at least one CPU in that group has come online
+ * at least once since boot. This kthread checks for newly posted
+ * callbacks from any of the CPUs it is responsible for, waits for a
+ * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
+ * that then have callback-invocation work to do.
*/
-static void nocb_leader_wait(struct rcu_data *my_rdp)
+static int rcu_nocb_gp_kthread(void *arg)
{
- bool firsttime = true;
- unsigned long flags;
- bool gotcbs;
- struct rcu_data *rdp;
- struct rcu_head **tail;
-
-wait_again:
-
- /* Wait for callbacks to appear. */
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep"));
- swait_event_interruptible_exclusive(my_rdp->nocb_wq,
- !READ_ONCE(my_rdp->nocb_leader_sleep));
- raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
- my_rdp->nocb_leader_sleep = true;
- WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&my_rdp->nocb_timer);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
- } else if (firsttime) {
- firsttime = false; /* Don't drown trace log with "Poll"! */
- trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll"));
- }
-
- /*
- * Each pass through the following loop checks a follower for CBs.
- * We are our own first follower. Any CBs found are moved to
- * nocb_gp_head, where they await a grace period.
- */
- gotcbs = false;
- smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
- if (!rdp->nocb_gp_head)
- continue; /* No CBs here, try next follower. */
-
- /* Move callbacks to wait-for-GP list, which is empty. */
- WRITE_ONCE(rdp->nocb_head, NULL);
- rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
- gotcbs = true;
- }
-
- /* No callbacks? Sleep a bit if polling, and go retry. */
- if (unlikely(!gotcbs)) {
- WARN_ON(signal_pending(current));
- if (rcu_nocb_poll) {
- schedule_timeout_interruptible(1);
- } else {
- trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu,
- TPS("WokeEmpty"));
- }
- goto wait_again;
- }
+ struct rcu_data *rdp = arg;
- /* Wait for one grace period. */
- rcu_nocb_wait_gp(my_rdp);
-
- /* Each pass through the following loop wakes a follower, if needed. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- if (!rcu_nocb_poll &&
- READ_ONCE(rdp->nocb_head) &&
- READ_ONCE(my_rdp->nocb_leader_sleep)) {
- raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
- my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
- raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
- }
- if (!rdp->nocb_gp_head)
- continue; /* No CBs, so no need to wake follower. */
-
- /* Append callbacks to follower's "done" list. */
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- tail = rdp->nocb_follower_tail;
- rdp->nocb_follower_tail = rdp->nocb_gp_tail;
- *tail = rdp->nocb_gp_head;
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
- /* List was empty, so wake up the follower. */
- swake_up_one(&rdp->nocb_wq);
- }
+ for (;;) {
+ WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
+ nocb_gp_wait(rdp);
+ cond_resched_tasks_rcu_qs();
}
-
- /* If we (the leader) don't have CBs, go wait some more. */
- if (!my_rdp->nocb_follower_head)
- goto wait_again;
+ return 0;
}
/*
- * Followers come here to wait for additional callbacks to show up.
- * This function does not return until callbacks appear.
+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
+ * then, if there are no more, wait for more to appear.
*/
-static void nocb_follower_wait(struct rcu_data *rdp)
+static void nocb_cb_wait(struct rcu_data *rdp)
{
- for (;;) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep"));
- swait_event_interruptible_exclusive(rdp->nocb_wq,
- READ_ONCE(rdp->nocb_follower_head));
- if (smp_load_acquire(&rdp->nocb_follower_head)) {
- /* ^^^ Ensure CB invocation follows _head test. */
- return;
- }
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool needwake_gp = false;
+ struct rcu_node *rnp = rdp->mynode;
+
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ local_bh_disable();
+ rcu_do_batch(rdp);
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+ raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
+ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ return;
+ }
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+ WRITE_ONCE(rdp->nocb_cb_sleep, true);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ !READ_ONCE(rdp->nocb_cb_sleep));
+ if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */
+ /* ^^^ Ensure CB invocation follows _sleep test. */
+ return;
}
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
}
/*
- * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
- * callbacks queued by the corresponding no-CBs CPU, however, there is
- * an optional leader-follower relationship so that the grace-period
- * kthreads don't have to do quite so many wakeups.
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
+ * nocb_cb_wait() to do the dirty work.
*/
-static int rcu_nocb_kthread(void *arg)
+static int rcu_nocb_cb_kthread(void *arg)
{
- int c, cl;
- unsigned long flags;
- struct rcu_head *list;
- struct rcu_head *next;
- struct rcu_head **tail;
struct rcu_data *rdp = arg;
- /* Each pass through this loop invokes one batch of callbacks */
+ // Each pass through this loop does one callback batch, and,
+ // if there are no more ready callbacks, waits for them.
for (;;) {
- /* Wait for callbacks. */
- if (rdp->nocb_leader == rdp)
- nocb_leader_wait(rdp);
- else
- nocb_follower_wait(rdp);
-
- /* Pull the ready-to-invoke callbacks onto local list. */
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
- list = rdp->nocb_follower_head;
- rdp->nocb_follower_head = NULL;
- tail = rdp->nocb_follower_tail;
- rdp->nocb_follower_tail = &rdp->nocb_follower_head;
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- if (WARN_ON_ONCE(!list))
- continue;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty"));
-
- /* Each pass through the following loop invokes a callback. */
- trace_rcu_batch_start(rcu_state.name,
- atomic_long_read(&rdp->nocb_q_count_lazy),
- rcu_get_n_cbs_nocb_cpu(rdp), -1);
- c = cl = 0;
- while (list) {
- next = list->next;
- /* Wait for enqueuing to complete, if needed. */
- while (next == NULL && &list->next != tail) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WaitQueue"));
- schedule_timeout_interruptible(1);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WokeQueue"));
- next = list->next;
- }
- debug_rcu_head_unqueue(list);
- local_bh_disable();
- if (__rcu_reclaim(rcu_state.name, list))
- cl++;
- c++;
- local_bh_enable();
- cond_resched_tasks_rcu_qs();
- list = next;
- }
- trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1);
- smp_mb__before_atomic(); /* _add after CB invocation. */
- atomic_long_add(-c, &rdp->nocb_q_count);
- atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
+ nocb_cb_wait(rdp);
+ cond_resched_tasks_rcu_qs();
}
return 0;
}
@@ -1993,14 +2159,14 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
unsigned long flags;
int ndw;
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ rcu_nocb_lock_irqsave(rdp, flags);
if (!rcu_nocb_need_deferred_wakeup(rdp)) {
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ rcu_nocb_unlock_irqrestore(rdp, flags);
return;
}
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
}
@@ -2027,6 +2193,7 @@ void __init rcu_init_nohz(void)
{
int cpu;
bool need_rcu_nocb_mask = false;
+ struct rcu_data *rdp;
#if defined(CONFIG_NO_HZ_FULL)
if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
@@ -2060,67 +2227,63 @@ void __init rcu_init_nohz(void)
if (rcu_nocb_poll)
pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
- for_each_cpu(cpu, rcu_nocb_mask)
- init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu));
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_offload(&rdp->cblist);
+ }
rcu_organize_nocb_kthreads();
}
/* Initialize per-rcu_data variables for no-CBs CPUs. */
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
- rdp->nocb_tail = &rdp->nocb_head;
- init_swait_queue_head(&rdp->nocb_wq);
- rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ init_swait_queue_head(&rdp->nocb_cb_wq);
+ init_swait_queue_head(&rdp->nocb_gp_wq);
raw_spin_lock_init(&rdp->nocb_lock);
+ raw_spin_lock_init(&rdp->nocb_bypass_lock);
+ raw_spin_lock_init(&rdp->nocb_gp_lock);
timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
+ timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
+ rcu_cblist_init(&rdp->nocb_bypass);
}
/*
* If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthread, spawn it. If the CPUs are brought online out of order,
- * this can require re-organizing the leader-follower relationships.
+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
+ * for this CPU's group has not yet been created, spawn it as well.
*/
static void rcu_spawn_one_nocb_kthread(int cpu)
{
- struct rcu_data *rdp;
- struct rcu_data *rdp_last;
- struct rcu_data *rdp_old_leader;
- struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp_gp;
struct task_struct *t;
/*
* If this isn't a no-CBs CPU or if it already has an rcuo kthread,
* then nothing to do.
*/
- if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread)
+ if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
return;
- /* If we didn't spawn the leader first, reorganize! */
- rdp_old_leader = rdp_spawn->nocb_leader;
- if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
- rdp_last = NULL;
- rdp = rdp_old_leader;
- do {
- rdp->nocb_leader = rdp_spawn;
- if (rdp_last && rdp != rdp_spawn)
- rdp_last->nocb_next_follower = rdp;
- if (rdp == rdp_spawn) {
- rdp = rdp->nocb_next_follower;
- } else {
- rdp_last = rdp;
- rdp = rdp->nocb_next_follower;
- rdp_last->nocb_next_follower = NULL;
- }
- } while (rdp);
- rdp_spawn->nocb_next_follower = rdp_old_leader;
+ /* If we didn't spawn the GP kthread first, reorganize! */
+ rdp_gp = rdp->nocb_gp_rdp;
+ if (!rdp_gp->nocb_gp_kthread) {
+ t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
+ "rcuog/%d", rdp_gp->cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
+ return;
+ WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
}
/* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_kthread, rdp_spawn,
+ t = kthread_run(rcu_nocb_cb_kthread, rdp,
"rcuo%c/%d", rcu_state.abbr, cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__))
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
return;
- WRITE_ONCE(rdp_spawn->nocb_kthread, t);
+ WRITE_ONCE(rdp->nocb_cb_kthread, t);
+ WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
}
/*
@@ -2147,27 +2310,28 @@ static void __init rcu_spawn_nocb_kthreads(void)
rcu_spawn_cpu_nocb_kthread(cpu);
}
-/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
-static int rcu_nocb_leader_stride = -1;
-module_param(rcu_nocb_leader_stride, int, 0444);
+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_gp_stride = -1;
+module_param(rcu_nocb_gp_stride, int, 0444);
/*
- * Initialize leader-follower relationships for all no-CBs CPU.
+ * Initialize GP-CB relationships for all no-CBs CPU.
*/
static void __init rcu_organize_nocb_kthreads(void)
{
int cpu;
- int ls = rcu_nocb_leader_stride;
- int nl = 0; /* Next leader. */
+ bool firsttime = true;
+ int ls = rcu_nocb_gp_stride;
+ int nl = 0; /* Next GP kthread. */
struct rcu_data *rdp;
- struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
+ struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
struct rcu_data *rdp_prev = NULL;
if (!cpumask_available(rcu_nocb_mask))
return;
if (ls == -1) {
- ls = int_sqrt(nr_cpu_ids);
- rcu_nocb_leader_stride = ls;
+ ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
+ rcu_nocb_gp_stride = ls;
}
/*
@@ -2178,39 +2342,24 @@ static void __init rcu_organize_nocb_kthreads(void)
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->cpu >= nl) {
- /* New leader, set up for followers & next leader. */
+ /* New GP kthread, set up for CBs & next GP. */
nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
- rdp->nocb_leader = rdp;
- rdp_leader = rdp;
+ rdp->nocb_gp_rdp = rdp;
+ rdp_gp = rdp;
+ if (!firsttime && dump_tree)
+ pr_cont("\n");
+ firsttime = false;
+ pr_alert("%s: No-CB GP kthread CPU %d:", __func__, cpu);
} else {
- /* Another follower, link to previous leader. */
- rdp->nocb_leader = rdp_leader;
- rdp_prev->nocb_next_follower = rdp;
+ /* Another CB kthread, link to previous GP kthread. */
+ rdp->nocb_gp_rdp = rdp_gp;
+ rdp_prev->nocb_next_cb_rdp = rdp;
+ pr_alert(" %d", cpu);
}
rdp_prev = rdp;
}
}
-/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
-static bool init_nocb_callback_list(struct rcu_data *rdp)
-{
- if (!rcu_is_nocb_cpu(rdp->cpu))
- return false;
-
- /* If there are early-boot callbacks, move them to nocb lists. */
- if (!rcu_segcblist_empty(&rdp->cblist)) {
- rdp->nocb_head = rcu_segcblist_head(&rdp->cblist);
- rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist);
- atomic_long_set(&rdp->nocb_q_count,
- rcu_segcblist_n_cbs(&rdp->cblist));
- atomic_long_set(&rdp->nocb_q_count_lazy,
- rcu_segcblist_n_lazy_cbs(&rdp->cblist));
- rcu_segcblist_init(&rdp->cblist);
- }
- rcu_segcblist_disable(&rdp->cblist);
- return true;
-}
-
/*
* Bind the current task to the offloaded CPUs. If there are no offloaded
* CPUs, leave the task unbound. Splat if the bind attempt fails.
@@ -2223,20 +2372,101 @@ void rcu_bind_current_to_nocb(void)
EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
/*
- * Return the number of RCU callbacks still queued from the specified
- * CPU, which must be a nocbs CPU.
+ * Dump out nocb grace-period kthread state for the specified rcu_data
+ * structure.
*/
-static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp)
+static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
{
- return atomic_long_read(&rdp->nocb_q_count);
+ struct rcu_node *rnp = rdp->mynode;
+
+ pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n",
+ rdp->cpu,
+ "kK"[!!rdp->nocb_gp_kthread],
+ "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[timer_pending(&rdp->nocb_timer)],
+ "bB"[timer_pending(&rdp->nocb_bypass_timer)],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[swait_active(&rdp->nocb_gp_wq)],
+ ".W"[swait_active(&rnp->nocb_gp_wq[0])],
+ ".W"[swait_active(&rnp->nocb_gp_wq[1])],
+ ".B"[!!rdp->nocb_gp_bypass],
+ ".G"[!!rdp->nocb_gp_gp],
+ (long)rdp->nocb_gp_seq,
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops));
+}
+
+/* Dump out nocb kthread state for the specified rcu_data structure. */
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *rsclp = &rdp->cblist;
+ bool waslocked;
+ bool wastimer;
+ bool wassleep;
+
+ if (rdp->nocb_gp_rdp == rdp)
+ show_rcu_nocb_gp_state(rdp);
+
+ pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n",
+ rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ "kK"[!!rdp->nocb_cb_kthread],
+ "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
+ "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
+ "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
+ "sS"[!!rdp->nocb_cb_sleep],
+ ".W"[swait_active(&rdp->nocb_cb_wq)],
+ jiffies - rdp->nocb_bypass_first,
+ jiffies - rdp->nocb_nobypass_last,
+ rdp->nocb_nobypass_count,
+ ".D"[rcu_segcblist_ready_cbs(rsclp)],
+ ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)],
+ ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)],
+ ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)],
+ ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ rcu_segcblist_n_cbs(&rdp->cblist));
+
+ /* It is OK for GP kthreads to have GP state. */
+ if (rdp->nocb_gp_rdp == rdp)
+ return;
+
+ waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
+ wastimer = timer_pending(&rdp->nocb_timer);
+ wassleep = swait_active(&rdp->nocb_gp_wq);
+ if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep &&
+ !waslocked && !wastimer && !wassleep)
+ return; /* Nothing untowards. */
+
+ pr_info(" !!! %c%c%c%c %c\n",
+ "lL"[waslocked],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[wastimer],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[wassleep]);
}
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-static bool rcu_nocb_cpu_needs_barrier(int cpu)
+/* No ->nocb_lock to acquire. */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
{
- WARN_ON_ONCE(1); /* Should be dead code. */
- return false;
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ local_irq_restore(flags);
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
}
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
@@ -2252,19 +2482,24 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
-static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
- bool lazy, unsigned long flags)
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
{
- return false;
+ return true;
}
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
- struct rcu_data *rdp,
- unsigned long flags)
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags)
{
return false;
}
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags)
+{
+ WARN_ON_ONCE(1); /* Should be dead code! */
+}
+
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
}
@@ -2286,14 +2521,8 @@ static void __init rcu_spawn_nocb_kthreads(void)
{
}
-static bool init_nocb_callback_list(struct rcu_data *rdp)
-{
- return false;
-}
-
-static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp)
+static void show_rcu_nocb_state(struct rcu_data *rdp)
{
- return 0;
}
#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 065183391f75..c0b8c458d8a6 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp)
//
// Printing RCU CPU stall warnings
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* Dump detailed information for all tasks blocking the current RCU
@@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
return ndetected;
}
-#else /* #ifdef CONFIG_PREEMPT */
+#else /* #ifdef CONFIG_PREEMPTION */
/*
* Because preemptible RCU does not exist, we never have to check for
@@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
{
return 0;
}
-#endif /* #else #ifdef CONFIG_PREEMPT */
+#endif /* #else #ifdef CONFIG_PREEMPTION */
/*
* Dump stacks of all tasks running on stalled CPUs. First try using
@@ -527,6 +527,8 @@ static void check_cpu_stall(struct rcu_data *rdp)
/* We haven't checked in, so go dump stack. */
print_cpu_stall();
+ if (rcu_cpu_stall_ftrace_dump)
+ rcu_ftrace_dump(DUMP_ALL);
} else if (rcu_gp_in_progress() &&
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
@@ -534,6 +536,8 @@ static void check_cpu_stall(struct rcu_data *rdp)
/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(gs2);
+ if (rcu_cpu_stall_ftrace_dump)
+ rcu_ftrace_dump(DUMP_ALL);
}
}
@@ -585,6 +589,11 @@ void show_rcu_gp_kthreads(void)
cpu, (long)rdp->gp_seq_needed);
}
}
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_is_offloaded(&rdp->cblist))
+ show_rcu_nocb_state(rdp);
+ }
/* sched_show_task(rcu_state.gp_kthread); */
}
EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 61df2bf08563..1861103662db 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -61,9 +61,15 @@ module_param(rcu_normal_after_boot, int, 0);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
- * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
+ * rcu_read_lock_held_common() - might we be in RCU-sched read-side critical section?
+ * @ret: Best guess answer if lockdep cannot be relied on
*
- * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
+ * Returns true if lockdep must be ignored, in which case *ret contains
+ * the best guess described below. Otherwise returns false, in which
+ * case *ret tells the caller nothing and the caller should instead
+ * consult lockdep.
+ *
+ * If CONFIG_DEBUG_LOCK_ALLOC is selected, set *ret to nonzero iff in an
* RCU-sched read-side critical section. In absence of
* CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
* critical section unless it can prove otherwise. Note that disabling
@@ -75,35 +81,45 @@ module_param(rcu_normal_after_boot, int, 0);
* Check debug_lockdep_rcu_enabled() to prevent false positives during boot
* and while lockdep is disabled.
*
- * Note that if the CPU is in the idle loop from an RCU point of
- * view (ie: that we are in the section between rcu_idle_enter() and
- * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
- * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs
- * that are in such a section, considering these as in extended quiescent
- * state, so such a CPU is effectively never in an RCU read-side critical
- * section regardless of what RCU primitives it invokes. This state of
- * affairs is required --- we need to keep an RCU-free window in idle
- * where the CPU may possibly enter into low power mode. This way we can
- * notice an extended quiescent state to other CPUs that started a grace
- * period. Otherwise we would delay any grace period as long as we run in
- * the idle task.
+ * Note that if the CPU is in the idle loop from an RCU point of view (ie:
+ * that we are in the section between rcu_idle_enter() and rcu_idle_exit())
+ * then rcu_read_lock_held() sets *ret to false even if the CPU did an
+ * rcu_read_lock(). The reason for this is that RCU ignores CPUs that are
+ * in such a section, considering these as in extended quiescent state,
+ * so such a CPU is effectively never in an RCU read-side critical section
+ * regardless of what RCU primitives it invokes. This state of affairs is
+ * required --- we need to keep an RCU-free window in idle where the CPU may
+ * possibly enter into low power mode. This way we can notice an extended
+ * quiescent state to other CPUs that started a grace period. Otherwise
+ * we would delay any grace period as long as we run in the idle task.
*
- * Similarly, we avoid claiming an SRCU read lock held if the current
+ * Similarly, we avoid claiming an RCU read lock held if the current
* CPU is offline.
*/
+static bool rcu_read_lock_held_common(bool *ret)
+{
+ if (!debug_lockdep_rcu_enabled()) {
+ *ret = 1;
+ return true;
+ }
+ if (!rcu_is_watching()) {
+ *ret = 0;
+ return true;
+ }
+ if (!rcu_lockdep_current_cpu_online()) {
+ *ret = 0;
+ return true;
+ }
+ return false;
+}
+
int rcu_read_lock_sched_held(void)
{
- int lockdep_opinion = 0;
+ bool ret;
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
- if (debug_locks)
- lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
- return lockdep_opinion || !preemptible();
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
+ return lock_is_held(&rcu_sched_lock_map) || !preemptible();
}
EXPORT_SYMBOL(rcu_read_lock_sched_held);
#endif
@@ -136,8 +152,7 @@ static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);
*/
bool rcu_gp_is_expedited(void)
{
- return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
- rcu_scheduler_active == RCU_SCHEDULER_INIT;
+ return rcu_expedited || atomic_read(&rcu_expedited_nesting);
}
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
@@ -261,12 +276,10 @@ NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled);
*/
int rcu_read_lock_held(void)
{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
return lock_is_held(&rcu_lock_map);
}
EXPORT_SYMBOL_GPL(rcu_read_lock_held);
@@ -288,16 +301,28 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_held);
*/
int rcu_read_lock_bh_held(void)
{
- if (!debug_lockdep_rcu_enabled())
- return 1;
- if (!rcu_is_watching())
- return 0;
- if (!rcu_lockdep_current_cpu_online())
- return 0;
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
+int rcu_read_lock_any_held(void)
+{
+ bool ret;
+
+ if (rcu_read_lock_held_common(&ret))
+ return ret;
+ if (lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map))
+ return 1;
+ return !preemptible();
+}
+EXPORT_SYMBOL_GPL(rcu_read_lock_any_held);
+
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/**
@@ -437,6 +462,8 @@ EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
#endif
#ifdef CONFIG_RCU_STALL_COMMON
+int rcu_cpu_stall_ftrace_dump __read_mostly;
+module_param(rcu_cpu_stall_ftrace_dump, int, 0644);
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
module_param(rcu_cpu_stall_suppress, int, 0644);
diff --git a/kernel/resource.c b/kernel/resource.c
index 158f04ec1d4f..74877e9d90ca 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -326,7 +326,7 @@ EXPORT_SYMBOL(release_resource);
*
* If a resource is found, returns 0 and @*res is overwritten with the part
* of the resource that's within [@start..@end]; if none is found, returns
- * -1 or -EINVAL for other invalid parameters.
+ * -ENODEV. Returns -EINVAL for invalid parameters.
*
* This function walks the whole tree and not just first level children
* unless @first_lvl is true.
@@ -342,6 +342,7 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
unsigned long flags, unsigned long desc,
bool first_lvl, struct resource *res)
{
+ bool siblings_only = true;
struct resource *p;
if (!res)
@@ -352,29 +353,43 @@ static int find_next_iomem_res(resource_size_t start, resource_size_t end,
read_lock(&resource_lock);
- for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) {
- if ((p->flags & flags) != flags)
- continue;
- if ((desc != IORES_DESC_NONE) && (desc != p->desc))
- continue;
+ for (p = iomem_resource.child; p; p = next_resource(p, siblings_only)) {
+ /* If we passed the resource we are looking for, stop */
if (p->start > end) {
p = NULL;
break;
}
- if ((p->end >= start) && (p->start <= end))
- break;
+
+ /* Skip until we find a range that matches what we look for */
+ if (p->end < start)
+ continue;
+
+ /*
+ * Now that we found a range that matches what we look for,
+ * check the flags and the descriptor. If we were not asked to
+ * use only the first level, start looking at children as well.
+ */
+ siblings_only = first_lvl;
+
+ if ((p->flags & flags) != flags)
+ continue;
+ if ((desc != IORES_DESC_NONE) && (desc != p->desc))
+ continue;
+
+ /* Found a match, break */
+ break;
+ }
+
+ if (p) {
+ /* copy data */
+ res->start = max(start, p->start);
+ res->end = min(end, p->end);
+ res->flags = p->flags;
+ res->desc = p->desc;
}
read_unlock(&resource_lock);
- if (!p)
- return -1;
-
- /* copy data */
- res->start = max(start, p->start);
- res->end = min(end, p->end);
- res->flags = p->flags;
- res->desc = p->desc;
- return 0;
+ return p ? 0 : -ENODEV;
}
static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
@@ -1628,6 +1643,62 @@ void resource_list_free(struct list_head *head)
}
EXPORT_SYMBOL(resource_list_free);
+#ifdef CONFIG_DEVICE_PRIVATE
+static struct resource *__request_free_mem_region(struct device *dev,
+ struct resource *base, unsigned long size, const char *name)
+{
+ resource_size_t end, addr;
+ struct resource *res;
+
+ size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
+ end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
+ addr = end - size + 1UL;
+
+ for (; addr > size && addr >= base->start; addr -= size) {
+ if (region_intersects(addr, size, 0, IORES_DESC_NONE) !=
+ REGION_DISJOINT)
+ continue;
+
+ if (dev)
+ res = devm_request_mem_region(dev, addr, size, name);
+ else
+ res = request_mem_region(addr, size, name);
+ if (!res)
+ return ERR_PTR(-ENOMEM);
+ res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+ return res;
+ }
+
+ return ERR_PTR(-ERANGE);
+}
+
+/**
+ * devm_request_free_mem_region - find free region for device private memory
+ *
+ * @dev: device struct to bind the resource to
+ * @size: size in bytes of the device memory to add
+ * @base: resource tree to look in
+ *
+ * This function tries to find an empty range of physical address big enough to
+ * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
+ * memory, which in turn allocates struct pages.
+ */
+struct resource *devm_request_free_mem_region(struct device *dev,
+ struct resource *base, unsigned long size)
+{
+ return __request_free_mem_region(dev, base, size, dev_name(dev));
+}
+EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
+
+struct resource *request_free_mem_region(struct resource *base,
+ unsigned long size, const char *name)
+{
+ return __request_free_mem_region(NULL, base, size, name);
+}
+EXPORT_SYMBOL_GPL(request_free_mem_region);
+
+#endif /* CONFIG_DEVICE_PRIVATE */
+
static int __init strict_iomem(char *str)
{
if (strstr(str, "relaxed"))
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa43ce3962e7..f9a1346a5fa9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -255,7 +255,7 @@ static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
- hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
/*
@@ -314,7 +314,7 @@ void hrtick_start(struct rq *rq, u64 delay)
*/
delay = max_t(u64, delay, 10000LL);
hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
}
#endif /* CONFIG_SMP */
@@ -328,7 +328,7 @@ static void hrtick_rq_init(struct rq *rq)
rq->hrtick_csd.info = rq;
#endif
- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
@@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
#ifdef CONFIG_UCLAMP_TASK
+/*
+ * Serializes updates of utilization clamp values
+ *
+ * The (slow-path) user-space triggers utilization clamp value updates which
+ * can require updates on (fast-path) scheduler's data structures used to
+ * support enqueue/dequeue operations.
+ * While the per-CPU rq lock protects fast-path update operations, user-space
+ * requests are serialized using a mutex to reduce the risk of conflicting
+ * updates or API abuses.
+ */
+static DEFINE_MUTEX(uclamp_mutex);
+
/* Max allowed minimum utilization */
unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
@@ -798,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
-static inline unsigned int uclamp_none(int clamp_id)
+static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
@@ -814,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,
}
static inline unsigned int
-uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
/*
@@ -830,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
return uclamp_none(UCLAMP_MIN);
}
-static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
/* Reset max-clamp retention only on idle exit */
@@ -841,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
}
static inline
-unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
- unsigned int clamp_value)
+enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
int bucket_id = UCLAMP_BUCKETS - 1;
@@ -861,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
return uclamp_idle_value(rq, clamp_id, clamp_value);
}
+static inline struct uclamp_se
+uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ struct uclamp_se uc_max;
+
+ /*
+ * Tasks in autogroups or root task group will be
+ * restricted by system defaults.
+ */
+ if (task_group_is_autogroup(task_group(p)))
+ return uc_req;
+ if (task_group(p) == &root_task_group)
+ return uc_req;
+
+ uc_max = task_group(p)->uclamp[clamp_id];
+ if (uc_req.value > uc_max.value || !uc_req.user_defined)
+ return uc_max;
+#endif
+
+ return uc_req;
+}
+
/*
* The effective clamp bucket index of a task depends on, by increasing
* priority:
* - the task specific clamp value, when explicitly requested from userspace
+ * - the task group effective clamp value, for tasks not either in the root
+ * group or in an autogroup
* - the system default clamp value, defined by the sysadmin
*/
static inline struct uclamp_se
-uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
- struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+ struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
struct uclamp_se uc_max = uclamp_default[clamp_id];
/* System default restrictions always apply */
@@ -880,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
return uc_req;
}
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
@@ -904,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
* for each bucket when all its RUNNABLE tasks require the same clamp.
*/
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
- unsigned int clamp_id)
+ enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -942,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
* enforce the expected state and warn.
*/
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
- unsigned int clamp_id)
+ enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -981,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -996,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
if (unlikely(!p->sched_class->uclamp_enabled))
return;
@@ -1005,15 +1043,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
uclamp_rq_dec_id(rq, p, clamp_id);
}
+static inline void
+uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ /*
+ * Lock the task and the rq where the task is (or was) queued.
+ *
+ * We might lock the (previous) rq of a !RUNNABLE task, but that's the
+ * price to pay to safely serialize util_{min,max} updates with
+ * enqueues, dequeues and migration operations.
+ * This is the same locking schema used by __set_cpus_allowed_ptr().
+ */
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Setting the clamp bucket is serialized by task_rq_lock().
+ * If the task is not yet RUNNABLE and its task_struct is not
+ * affecting a valid clamp bucket, the next time it's enqueued,
+ * it will already see the updated clamp bucket value.
+ */
+ if (!p->uclamp[clamp_id].active) {
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+ }
+
+ task_rq_unlock(rq, p, &rf);
+}
+
+static inline void
+uclamp_update_active_tasks(struct cgroup_subsys_state *css,
+ unsigned int clamps)
+{
+ enum uclamp_id clamp_id;
+ struct css_task_iter it;
+ struct task_struct *p;
+
+ css_task_iter_start(css, 0, &it);
+ while ((p = css_task_iter_next(&it))) {
+ for_each_clamp_id(clamp_id) {
+ if ((0x1 << clamp_id) & clamps)
+ uclamp_update_active(p, clamp_id);
+ }
+ }
+ css_task_iter_end(&it);
+}
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css);
+static void uclamp_update_root_tg(void)
+{
+ struct task_group *tg = &root_task_group;
+
+ uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
+ sysctl_sched_uclamp_util_min, false);
+ uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
+ sysctl_sched_uclamp_util_max, false);
+
+ rcu_read_lock();
+ cpu_util_update_eff(&root_task_group.css);
+ rcu_read_unlock();
+}
+#else
+static void uclamp_update_root_tg(void) { }
+#endif
+
int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
+ bool update_root_tg = false;
int old_min, old_max;
- static DEFINE_MUTEX(mutex);
int result;
- mutex_lock(&mutex);
+ mutex_lock(&uclamp_mutex);
old_min = sysctl_sched_uclamp_util_min;
old_max = sysctl_sched_uclamp_util_max;
@@ -1032,23 +1137,30 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
if (old_min != sysctl_sched_uclamp_util_min) {
uclamp_se_set(&uclamp_default[UCLAMP_MIN],
sysctl_sched_uclamp_util_min, false);
+ update_root_tg = true;
}
if (old_max != sysctl_sched_uclamp_util_max) {
uclamp_se_set(&uclamp_default[UCLAMP_MAX],
sysctl_sched_uclamp_util_max, false);
+ update_root_tg = true;
}
+ if (update_root_tg)
+ uclamp_update_root_tg();
+
/*
- * Updating all the RUNNABLE task is expensive, keep it simple and do
- * just a lazy update at each next enqueue time.
+ * We update all RUNNABLE tasks only when task groups are in use.
+ * Otherwise, keep it simple and do just a lazy update at each next
+ * task enqueue time.
*/
+
goto done;
undo:
sysctl_sched_uclamp_util_min = old_min;
sysctl_sched_uclamp_util_max = old_max;
done:
- mutex_unlock(&mutex);
+ mutex_unlock(&uclamp_mutex);
return result;
}
@@ -1075,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p,
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
/*
* On scheduling class change, reset to default clamps for tasks
@@ -1112,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p,
static void uclamp_fork(struct task_struct *p)
{
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;
@@ -1134,9 +1246,11 @@ static void uclamp_fork(struct task_struct *p)
static void __init init_uclamp(void)
{
struct uclamp_se uc_max = {};
- unsigned int clamp_id;
+ enum uclamp_id clamp_id;
int cpu;
+ mutex_init(&uclamp_mutex);
+
for_each_possible_cpu(cpu) {
memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
cpu_rq(cpu)->uclamp_flags = 0;
@@ -1149,8 +1263,13 @@ static void __init init_uclamp(void)
/* System defaults allow max clamp values for both indexes */
uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
- for_each_clamp_id(clamp_id)
+ for_each_clamp_id(clamp_id) {
uclamp_default[clamp_id] = uc_max;
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ root_task_group.uclamp_req[clamp_id] = uc_max;
+ root_task_group.uclamp[clamp_id] = uc_max;
+#endif
+ }
}
#else /* CONFIG_UCLAMP_TASK */
@@ -1494,7 +1613,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
}
/*
@@ -2399,6 +2518,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
+ preempt_disable();
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -2412,7 +2532,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* it disabling IRQs (this allows not taking ->pi_lock).
*/
if (!(p->state & state))
- return false;
+ goto out;
success = 1;
cpu = task_cpu(p);
@@ -2526,6 +2646,7 @@ unlock:
out:
if (success)
ttwu_stat(p, cpu, wake_flags);
+ preempt_enable();
return success;
}
@@ -3212,12 +3333,8 @@ static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
- struct mm_struct *mm, *oldmm;
-
prepare_task_switch(rq, prev, next);
- mm = next->mm;
- oldmm = prev->active_mm;
/*
* For paravirt, this is coupled with an exit in switch_to to
* combine the page table reload and the switch backend into
@@ -3226,22 +3343,37 @@ context_switch(struct rq *rq, struct task_struct *prev,
arch_start_context_switch(prev);
/*
- * If mm is non-NULL, we pass through switch_mm(). If mm is
- * NULL, we will pass through mmdrop() in finish_task_switch().
- * Both of these contain the full memory barrier required by
- * membarrier after storing to rq->curr, before returning to
- * user-space.
+ * kernel -> kernel lazy + transfer active
+ * user -> kernel lazy + mmgrab() active
+ *
+ * kernel -> user switch + mmdrop() active
+ * user -> user switch
*/
- if (!mm) {
- next->active_mm = oldmm;
- mmgrab(oldmm);
- enter_lazy_tlb(oldmm, next);
- } else
- switch_mm_irqs_off(oldmm, mm, next);
+ if (!next->mm) { // to kernel
+ enter_lazy_tlb(prev->active_mm, next);
+
+ next->active_mm = prev->active_mm;
+ if (prev->mm) // from user
+ mmgrab(prev->active_mm);
+ else
+ prev->active_mm = NULL;
+ } else { // to user
+ /*
+ * sys_membarrier() requires an smp_mb() between setting
+ * rq->curr and returning to userspace.
+ *
+ * The below provides this either through switch_mm(), or in
+ * case 'prev->active_mm == next->mm' through
+ * finish_task_switch()'s mmdrop().
+ */
+
+ switch_mm_irqs_off(prev->active_mm, next->mm, next);
- if (!prev->mm) {
- prev->active_mm = NULL;
- rq->prev_mm = oldmm;
+ if (!prev->mm) { // from kernel
+ /* will mmdrop() in finish_task_switch(). */
+ rq->prev_mm = prev->active_mm;
+ prev->active_mm = NULL;
+ }
}
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@ -3484,8 +3616,36 @@ void scheduler_tick(void)
struct tick_work {
int cpu;
+ atomic_t state;
struct delayed_work work;
};
+/* Values for ->state, see diagram below. */
+#define TICK_SCHED_REMOTE_OFFLINE 0
+#define TICK_SCHED_REMOTE_OFFLINING 1
+#define TICK_SCHED_REMOTE_RUNNING 2
+
+/*
+ * State diagram for ->state:
+ *
+ *
+ * TICK_SCHED_REMOTE_OFFLINE
+ * | ^
+ * | |
+ * | | sched_tick_remote()
+ * | |
+ * | |
+ * +--TICK_SCHED_REMOTE_OFFLINING
+ * | ^
+ * | |
+ * sched_tick_start() | | sched_tick_stop()
+ * | |
+ * V |
+ * TICK_SCHED_REMOTE_RUNNING
+ *
+ *
+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
+ * and sched_tick_start() are happy to leave the state in RUNNING.
+ */
static struct tick_work __percpu *tick_work_cpu;
@@ -3498,6 +3658,7 @@ static void sched_tick_remote(struct work_struct *work)
struct task_struct *curr;
struct rq_flags rf;
u64 delta;
+ int os;
/*
* Handle the tick only if it appears the remote CPU is running in full
@@ -3511,7 +3672,7 @@ static void sched_tick_remote(struct work_struct *work)
rq_lock_irq(rq, &rf);
curr = rq->curr;
- if (is_idle_task(curr))
+ if (is_idle_task(curr) || cpu_is_offline(cpu))
goto out_unlock;
update_rq_clock(rq);
@@ -3531,13 +3692,18 @@ out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
- * to keep scheduler internal stats reasonably up to date.
+ * to keep scheduler internal stats reasonably up to date. But
+ * first update state to reflect hotplug activity if required.
*/
- queue_delayed_work(system_unbound_wq, dwork, HZ);
+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
+ if (os == TICK_SCHED_REMOTE_RUNNING)
+ queue_delayed_work(system_unbound_wq, dwork, HZ);
}
static void sched_tick_start(int cpu)
{
+ int os;
struct tick_work *twork;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
@@ -3546,15 +3712,20 @@ static void sched_tick_start(int cpu)
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
- twork->cpu = cpu;
- INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
- queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
+ if (os == TICK_SCHED_REMOTE_OFFLINE) {
+ twork->cpu = cpu;
+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+ queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
static void sched_tick_stop(int cpu)
{
struct tick_work *twork;
+ int os;
if (housekeeping_cpu(cpu, HK_FLAG_TICK))
return;
@@ -3562,7 +3733,10 @@ static void sched_tick_stop(int cpu)
WARN_ON_ONCE(!tick_work_cpu);
twork = per_cpu_ptr(tick_work_cpu, cpu);
- cancel_delayed_work_sync(&twork->work);
+ /* There cannot be competing actions, but don't rely on stop-machine. */
+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
+ /* Don't cancel, as this would mess up the state machine. */
}
#endif /* CONFIG_HOTPLUG_CPU */
@@ -3570,7 +3744,6 @@ int __init sched_tick_offload_init(void)
{
tick_work_cpu = alloc_percpu(struct tick_work);
BUG_ON(!tick_work_cpu);
-
return 0;
}
@@ -3579,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { }
static inline void sched_tick_stop(int cpu) { }
#endif
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_TRACE_PREEMPT_TOGGLE))
/*
* If the value passed in is equal to the current preempt count
@@ -3698,13 +3871,22 @@ static noinline void __schedule_bug(struct task_struct *prev)
/*
* Various schedule()-time debugging checks and statistics:
*/
-static inline void schedule_debug(struct task_struct *prev)
+static inline void schedule_debug(struct task_struct *prev, bool preempt)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
if (task_stack_end_corrupted(prev))
panic("corrupted stack end detected inside scheduler\n");
#endif
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ if (!preempt && prev->state && prev->non_block_count) {
+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
+ prev->comm, prev->pid, prev->non_block_count);
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+ }
+#endif
+
if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
preempt_count_set(PREEMPT_DISABLED);
@@ -3737,7 +3919,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
- goto again;
+ goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
@@ -3746,14 +3928,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
return p;
}
-again:
+restart:
+ /*
+ * Ensure that we put DL/RT tasks before the pick loop, such that they
+ * can PULL higher prio tasks when we lower the RQ 'priority'.
+ */
+ prev->sched_class->put_prev_task(rq, prev, rf);
+ if (!rq->nr_running)
+ newidle_balance(rq, rf);
+
for_each_class(class) {
- p = class->pick_next_task(rq, prev, rf);
- if (p) {
- if (unlikely(p == RETRY_TASK))
- goto again;
+ p = class->pick_next_task(rq, NULL, NULL);
+ if (p)
return p;
- }
}
/* The idle class should always have a runnable task: */
@@ -3780,7 +3967,7 @@ again:
* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
* called on the nearest possible occasion:
*
- * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
*
* - in syscall or exception context, at the next outmost
* preempt_enable(). (this might be as soon as the wake_up()'s
@@ -3789,7 +3976,7 @@ again:
* - in IRQ context, return from interrupt-handler to
* preemptible context
*
- * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
* then at the next:
*
* - cond_resched() call
@@ -3811,7 +3998,7 @@ static void __sched notrace __schedule(bool preempt)
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev);
+ schedule_debug(prev, preempt);
if (sched_feat(HRTICK))
hrtick_clear(rq);
@@ -3902,7 +4089,7 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk)
{
- if (!tsk->state || tsk_is_pi_blocked(tsk))
+ if (!tsk->state)
return;
/*
@@ -3918,6 +4105,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
preempt_enable_no_resched();
}
+ if (tsk_is_pi_blocked(tsk))
+ return;
+
/*
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
@@ -4031,7 +4221,7 @@ static void __sched notrace preempt_schedule_common(void)
} while (need_resched());
}
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* this is the entry point to schedule() from in-kernel preemption
* off of preempt_enable. Kernel preemptions off return from interrupt
@@ -4103,7 +4293,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
}
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
/*
* this is the entry point to schedule() from kernel preemption
@@ -4271,7 +4461,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
if (queued)
enqueue_task(rq, p, queue_flag);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -4338,7 +4528,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_curr(rq);
}
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
out_unlock:
task_rq_unlock(rq, p, &rf);
}
@@ -4655,6 +4845,9 @@ recheck:
return retval;
}
+ if (pi)
+ cpuset_read_lock();
+
/*
* Make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
@@ -4669,8 +4862,8 @@ recheck:
* Changing the policy of the stop threads its a very bad idea:
*/
if (p == rq->stop) {
- task_rq_unlock(rq, p, &rf);
- return -EINVAL;
+ retval = -EINVAL;
+ goto unlock;
}
/*
@@ -4688,8 +4881,8 @@ recheck:
goto change;
p->sched_reset_on_fork = reset_on_fork;
- task_rq_unlock(rq, p, &rf);
- return 0;
+ retval = 0;
+ goto unlock;
}
change:
@@ -4702,8 +4895,8 @@ change:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
+ retval = -EPERM;
+ goto unlock;
}
#endif
#ifdef CONFIG_SMP
@@ -4718,8 +4911,8 @@ change:
*/
if (!cpumask_subset(span, p->cpus_ptr) ||
rq->rd->dl_bw.bw == 0) {
- task_rq_unlock(rq, p, &rf);
- return -EPERM;
+ retval = -EPERM;
+ goto unlock;
}
}
#endif
@@ -4729,6 +4922,8 @@ change:
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
+ if (pi)
+ cpuset_read_unlock();
goto recheck;
}
@@ -4738,8 +4933,8 @@ change:
* is available.
*/
if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
- task_rq_unlock(rq, p, &rf);
- return -EBUSY;
+ retval = -EBUSY;
+ goto unlock;
}
p->sched_reset_on_fork = reset_on_fork;
@@ -4781,7 +4976,7 @@ change:
enqueue_task(rq, p, queue_flags);
}
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
check_class_changed(rq, p, prev_class, oldprio);
@@ -4789,14 +4984,22 @@ change:
preempt_disable();
task_rq_unlock(rq, p, &rf);
- if (pi)
+ if (pi) {
+ cpuset_read_unlock();
rt_mutex_adjust_pi(p);
+ }
/* Run balance callbacks after we've adjusted the PI chain: */
balance_callback(rq);
preempt_enable();
return 0;
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ if (pi)
+ cpuset_read_unlock();
+ return retval;
}
static int _sched_setscheduler(struct task_struct *p, int policy,
@@ -4880,10 +5083,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
- if (p != NULL)
- retval = sched_setscheduler(p, policy, &lparam);
+ if (likely(p))
+ get_task_struct(p);
rcu_read_unlock();
+ if (likely(p)) {
+ retval = sched_setscheduler(p, policy, &lparam);
+ put_task_struct(p);
+ }
+
return retval;
}
@@ -5100,37 +5308,40 @@ out_unlock:
return retval;
}
-static int sched_read_attr(struct sched_attr __user *uattr,
- struct sched_attr *attr,
- unsigned int usize)
+/*
+ * Copy the kernel size attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+ struct sched_attr *kattr,
+ unsigned int usize)
{
- int ret;
+ unsigned int ksize = sizeof(*kattr);
if (!access_ok(uattr, usize))
return -EFAULT;
/*
- * If we're handed a smaller struct than we know of,
- * ensure all the unknown bits are 0 - i.e. old
- * user-space does not get uncomplete information.
+ * sched_getattr() ABI forwards and backwards compatibility:
+ *
+ * If usize == ksize then we just copy everything to user-space and all is good.
+ *
+ * If usize < ksize then we only copy as much as user-space has space for,
+ * this keeps ABI compatibility as well. We skip the rest.
+ *
+ * If usize > ksize then user-space is using a newer version of the ABI,
+ * which part the kernel doesn't know about. Just ignore it - tooling can
+ * detect the kernel's knowledge of attributes from the attr->size value
+ * which is set to ksize in this case.
*/
- if (usize < sizeof(*attr)) {
- unsigned char *addr;
- unsigned char *end;
-
- addr = (void *)attr + usize;
- end = (void *)attr + sizeof(*attr);
-
- for (; addr < end; addr++) {
- if (*addr)
- return -EFBIG;
- }
-
- attr->size = usize;
- }
+ kattr->size = min(usize, ksize);
- ret = copy_to_user(uattr, attr, attr->size);
- if (ret)
+ if (copy_to_user(uattr, kattr, kattr->size))
return -EFAULT;
return 0;
@@ -5140,20 +5351,18 @@ static int sched_read_attr(struct sched_attr __user *uattr,
* sys_sched_getattr - similar to sched_getparam, but with sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
- * @size: sizeof(attr) for fwd/bwd comp.
+ * @usize: sizeof(attr) that user-space knows about, for forwards and backwards compatibility.
* @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, size, unsigned int, flags)
+ unsigned int, usize, unsigned int, flags)
{
- struct sched_attr attr = {
- .size = sizeof(struct sched_attr),
- };
+ struct sched_attr kattr = { };
struct task_struct *p;
int retval;
- if (!uattr || pid < 0 || size > PAGE_SIZE ||
- size < SCHED_ATTR_SIZE_VER0 || flags)
+ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
rcu_read_lock();
@@ -5166,25 +5375,24 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
if (retval)
goto out_unlock;
- attr.sched_policy = p->policy;
+ kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
- attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
if (task_has_dl_policy(p))
- __getparam_dl(p, &attr);
+ __getparam_dl(p, &kattr);
else if (task_has_rt_policy(p))
- attr.sched_priority = p->rt_priority;
+ kattr.sched_priority = p->rt_priority;
else
- attr.sched_nice = task_nice(p);
+ kattr.sched_nice = task_nice(p);
#ifdef CONFIG_UCLAMP_TASK
- attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
#endif
rcu_read_unlock();
- retval = sched_read_attr(uattr, &attr, size);
- return retval;
+ return sched_attr_copy_to_user(uattr, &kattr, usize);
out_unlock:
rcu_read_unlock();
@@ -5414,7 +5622,7 @@ SYSCALL_DEFINE0(sched_yield)
return 0;
}
-#ifndef CONFIG_PREEMPT
+#ifndef CONFIG_PREEMPTION
int __sched _cond_resched(void)
{
if (should_resched(0)) {
@@ -5431,7 +5639,7 @@ EXPORT_SYMBOL(_cond_resched);
* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
* call schedule, and on return reacquire the lock.
*
- * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
* operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand).
*/
@@ -5970,7 +6178,7 @@ void sched_setnuma(struct task_struct *p, int nid)
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
- set_curr_task(rq, p);
+ set_next_task(rq, p);
task_rq_unlock(rq, p, &rf);
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -6010,21 +6218,22 @@ static void calc_load_migrate(struct rq *rq)
atomic_long_add(delta, &calc_load_tasks);
}
-static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+static struct task_struct *__pick_migrate_task(struct rq *rq)
{
-}
+ const struct sched_class *class;
+ struct task_struct *next;
-static const struct sched_class fake_sched_class = {
- .put_prev_task = put_prev_task_fake,
-};
+ for_each_class(class) {
+ next = class->pick_next_task(rq, NULL, NULL);
+ if (next) {
+ next->sched_class->put_prev_task(rq, next, NULL);
+ return next;
+ }
+ }
-static struct task_struct fake_task = {
- /*
- * Avoid pull_{rt,dl}_task()
- */
- .prio = MAX_PRIO + 1,
- .sched_class = &fake_sched_class,
-};
+ /* The idle class should always have a runnable task */
+ BUG();
+}
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
@@ -6067,12 +6276,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
if (rq->nr_running == 1)
break;
- /*
- * pick_next_task() assumes pinned rq->lock:
- */
- next = pick_next_task(rq, &fake_task, rf);
- BUG_ON(!next);
- put_prev_task(rq, next);
+ next = __pick_migrate_task(rq);
/*
* Rules for changing task_struct::cpus_mask are holding
@@ -6369,19 +6573,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
void __init sched_init(void)
{
- unsigned long alloc_size = 0, ptr;
+ unsigned long ptr = 0;
int i;
wait_bit_init();
#ifdef CONFIG_FAIR_GROUP_SCHED
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
- alloc_size += 2 * nr_cpu_ids * sizeof(void **);
+ ptr += 2 * nr_cpu_ids * sizeof(void **);
#endif
- if (alloc_size) {
- ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
+ if (ptr) {
+ ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
#ifdef CONFIG_FAIR_GROUP_SCHED
root_task_group.se = (struct sched_entity **)ptr;
@@ -6568,7 +6772,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
rcu_sleep_check();
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
+ !is_idle_task(current) && !current->non_block_count) ||
system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
oops_in_progress)
return;
@@ -6584,8 +6788,8 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
"BUG: sleeping function called from invalid context at %s:%d\n",
file, line);
printk(KERN_ERR
- "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
- in_atomic(), irqs_disabled(),
+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
current->pid, current->comm);
if (task_stack_end_corrupted(current))
@@ -6700,7 +6904,7 @@ struct task_struct *curr_task(int cpu)
#ifdef CONFIG_IA64
/**
- * set_curr_task - set the current task for a given CPU.
+ * ia64_set_curr_task - set the current task for a given CPU.
* @cpu: the processor in question.
* @p: the task pointer to set.
*
@@ -6725,6 +6929,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);
+static inline void alloc_uclamp_sched_group(struct task_group *tg,
+ struct task_group *parent)
+{
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ enum uclamp_id clamp_id;
+
+ for_each_clamp_id(clamp_id) {
+ uclamp_se_set(&tg->uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
+ tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
+ }
+#endif
+}
+
static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
@@ -6748,6 +6966,8 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ alloc_uclamp_sched_group(tg, parent);
+
return tg;
err:
@@ -6851,7 +7071,7 @@ void sched_move_task(struct task_struct *tsk)
if (queued)
enqueue_task(rq, tsk, queue_flags);
if (running)
- set_curr_task(rq, tsk);
+ set_next_task(rq, tsk);
task_rq_unlock(rq, tsk, &rf);
}
@@ -6934,10 +7154,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
-#else
- /* We don't support RT-tasks being in separate groups */
- if (task->sched_class != &fair_sched_class)
- return -EINVAL;
#endif
/*
* Serialize against wake_up_new_task() such that if its
@@ -6968,6 +7184,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
sched_move_task(task);
}
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+static void cpu_util_update_eff(struct cgroup_subsys_state *css)
+{
+ struct cgroup_subsys_state *top_css = css;
+ struct uclamp_se *uc_parent = NULL;
+ struct uclamp_se *uc_se = NULL;
+ unsigned int eff[UCLAMP_CNT];
+ enum uclamp_id clamp_id;
+ unsigned int clamps;
+
+ css_for_each_descendant_pre(css, top_css) {
+ uc_parent = css_tg(css)->parent
+ ? css_tg(css)->parent->uclamp : NULL;
+
+ for_each_clamp_id(clamp_id) {
+ /* Assume effective clamps matches requested clamps */
+ eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
+ /* Cap effective clamps with parent's effective clamps */
+ if (uc_parent &&
+ eff[clamp_id] > uc_parent[clamp_id].value) {
+ eff[clamp_id] = uc_parent[clamp_id].value;
+ }
+ }
+ /* Ensure protection is always capped by limit */
+ eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
+
+ /* Propagate most restrictive effective clamps */
+ clamps = 0x0;
+ uc_se = css_tg(css)->uclamp;
+ for_each_clamp_id(clamp_id) {
+ if (eff[clamp_id] == uc_se[clamp_id].value)
+ continue;
+ uc_se[clamp_id].value = eff[clamp_id];
+ uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
+ clamps |= (0x1 << clamp_id);
+ }
+ if (!clamps) {
+ css = css_rightmost_descendant(css);
+ continue;
+ }
+
+ /* Immediately update descendants RUNNABLE tasks */
+ uclamp_update_active_tasks(css, clamps);
+ }
+}
+
+/*
+ * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
+ * C expression. Since there is no way to convert a macro argument (N) into a
+ * character constant, use two levels of macros.
+ */
+#define _POW10(exp) ((unsigned int)1e##exp)
+#define POW10(exp) _POW10(exp)
+
+struct uclamp_request {
+#define UCLAMP_PERCENT_SHIFT 2
+#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
+ s64 percent;
+ u64 util;
+ int ret;
+};
+
+static inline struct uclamp_request
+capacity_from_percent(char *buf)
+{
+ struct uclamp_request req = {
+ .percent = UCLAMP_PERCENT_SCALE,
+ .util = SCHED_CAPACITY_SCALE,
+ .ret = 0,
+ };
+
+ buf = strim(buf);
+ if (strcmp(buf, "max")) {
+ req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
+ &req.percent);
+ if (req.ret)
+ return req;
+ if (req.percent > UCLAMP_PERCENT_SCALE) {
+ req.ret = -ERANGE;
+ return req;
+ }
+
+ req.util = req.percent << SCHED_CAPACITY_SHIFT;
+ req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
+ }
+
+ return req;
+}
+
+static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off,
+ enum uclamp_id clamp_id)
+{
+ struct uclamp_request req;
+ struct task_group *tg;
+
+ req = capacity_from_percent(buf);
+ if (req.ret)
+ return req.ret;
+
+ mutex_lock(&uclamp_mutex);
+ rcu_read_lock();
+
+ tg = css_tg(of_css(of));
+ if (tg->uclamp_req[clamp_id].value != req.util)
+ uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
+
+ /*
+ * Because of not recoverable conversion rounding we keep track of the
+ * exact requested value
+ */
+ tg->uclamp_pct[clamp_id] = req.percent;
+
+ /* Update effective clamps to track the most restrictive value */
+ cpu_util_update_eff(of_css(of));
+
+ rcu_read_unlock();
+ mutex_unlock(&uclamp_mutex);
+
+ return nbytes;
+}
+
+static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
+}
+
+static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
+}
+
+static inline void cpu_uclamp_print(struct seq_file *sf,
+ enum uclamp_id clamp_id)
+{
+ struct task_group *tg;
+ u64 util_clamp;
+ u64 percent;
+ u32 rem;
+
+ rcu_read_lock();
+ tg = css_tg(seq_css(sf));
+ util_clamp = tg->uclamp_req[clamp_id].value;
+ rcu_read_unlock();
+
+ if (util_clamp == SCHED_CAPACITY_SCALE) {
+ seq_puts(sf, "max\n");
+ return;
+ }
+
+ percent = tg->uclamp_pct[clamp_id];
+ percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
+ seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
+}
+
+static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
+{
+ cpu_uclamp_print(sf, UCLAMP_MIN);
+ return 0;
+}
+
+static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
+{
+ cpu_uclamp_print(sf, UCLAMP_MAX);
+ return 0;
+}
+#endif /* CONFIG_UCLAMP_TASK_GROUP */
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
@@ -7313,6 +7701,20 @@ static struct cftype cpu_legacy_files[] = {
.write_u64 = cpu_rt_period_write_uint,
},
#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ {
+ .name = "uclamp.min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_min_show,
+ .write = cpu_uclamp_min_write,
+ },
+ {
+ .name = "uclamp.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_max_show,
+ .write = cpu_uclamp_max_write,
+ },
+#endif
{ } /* Terminate */
};
@@ -7480,6 +7882,20 @@ static struct cftype cpu_files[] = {
.write = cpu_max_write,
},
#endif
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ {
+ .name = "uclamp.min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_min_show,
+ .write = cpu_uclamp_min_write,
+ },
+ {
+ .name = "uclamp.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cpu_uclamp_max_show,
+ .write = cpu_uclamp_max_write,
+ },
+#endif
{ } /* terminate */
};
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 636ca6f88c8e..86800b4d5453 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -40,6 +40,7 @@ struct sugov_policy {
struct task_struct *thread;
bool work_in_progress;
+ bool limits_changed;
bool need_freq_update;
};
@@ -89,8 +90,11 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
- if (unlikely(sg_policy->need_freq_update))
+ if (unlikely(sg_policy->limits_changed)) {
+ sg_policy->limits_changed = false;
+ sg_policy->need_freq_update = true;
return true;
+ }
delta_ns = time - sg_policy->last_freq_update_time;
@@ -113,6 +117,7 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
struct cpufreq_policy *policy = sg_policy->policy;
+ int cpu;
if (!sugov_update_next_freq(sg_policy, time, next_freq))
return;
@@ -122,7 +127,11 @@ static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
return;
policy->cur = next_freq;
- trace_cpu_frequency(next_freq, smp_processor_id());
+
+ if (trace_cpu_frequency_enabled()) {
+ for_each_cpu(cpu, policy->cpus)
+ trace_cpu_frequency(next_freq, cpu);
+ }
}
static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
@@ -259,9 +268,9 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
* irq metric. Because IRQ/steal time is hidden from the task clock we
* need to scale the task numbers:
*
- * 1 - irq
- * U' = irq + ------- * U
- * max
+ * max - irq
+ * U' = irq + --------- * U
+ * max
*/
util = scale_irq_capacity(util, irq, max);
util += irq;
@@ -437,7 +446,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
- sg_policy->need_freq_update = true;
+ sg_policy->limits_changed = true;
}
static void sugov_update_single(struct update_util_data *hook, u64 time,
@@ -457,7 +466,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
if (!sugov_should_update_freq(sg_policy, time))
return;
- busy = sugov_cpu_is_busy(sg_cpu);
+ /* Limits may have changed, don't skip frequency update */
+ busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);
util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
@@ -831,6 +841,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->last_freq_update_time = 0;
sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
+ sg_policy->limits_changed = false;
sg_policy->need_freq_update = false;
sg_policy->cached_raw_freq = 0;
@@ -879,7 +890,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
mutex_unlock(&sg_policy->work_lock);
}
- sg_policy->need_freq_update = true;
+ sg_policy->limits_changed = true;
}
struct cpufreq_governor schedutil_gov = {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ef5b9f6b1d42..2dc48720f189 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -287,7 +287,7 @@ static void task_non_contending(struct task_struct *p)
dl_se->dl_non_contending = 1;
get_task_struct(p);
- hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
+ hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL_HARD);
}
static void task_contending(struct sched_dl_entity *dl_se, int flags)
@@ -529,6 +529,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
{
struct rq *later_rq = NULL;
+ struct dl_bw *dl_b;
later_rq = find_lock_later_rq(p, rq);
if (!later_rq) {
@@ -557,6 +558,38 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
double_lock_balance(rq, later_rq);
}
+ if (p->dl.dl_non_contending || p->dl.dl_throttled) {
+ /*
+ * Inactive timer is armed (or callback is running, but
+ * waiting for us to release rq locks). In any case, when it
+ * will fire (or continue), it will see running_bw of this
+ * task migrated to later_rq (and correctly handle it).
+ */
+ sub_running_bw(&p->dl, &rq->dl);
+ sub_rq_bw(&p->dl, &rq->dl);
+
+ add_rq_bw(&p->dl, &later_rq->dl);
+ add_running_bw(&p->dl, &later_rq->dl);
+ } else {
+ sub_rq_bw(&p->dl, &rq->dl);
+ add_rq_bw(&p->dl, &later_rq->dl);
+ }
+
+ /*
+ * And we finally need to fixup root_domain(s) bandwidth accounting,
+ * since p is still hanging out in the old (now moved to default) root
+ * domain.
+ */
+ dl_b = &rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+ __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
+
+ dl_b = &later_rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+ __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
+ raw_spin_unlock(&dl_b->lock);
+
set_task_cpu(p, later_rq->cpu);
double_unlock_balance(later_rq, rq);
@@ -923,7 +956,7 @@ static int start_dl_timer(struct task_struct *p)
*/
if (!hrtimer_is_queued(timer)) {
get_task_struct(p);
- hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+ hrtimer_start(timer, act, HRTIMER_MODE_ABS_HARD);
}
return 1;
@@ -1053,7 +1086,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
timer->function = dl_task_timer;
}
@@ -1292,7 +1325,7 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->inactive_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
timer->function = inactive_task_timer;
}
@@ -1694,12 +1727,20 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
}
#endif
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p)
{
p->se.exec_start = rq_clock_task(rq);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
+
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, p);
+
+ if (rq->curr->sched_class != &dl_sched_class)
+ update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+ deadline_queue_push_tasks(rq);
}
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1720,64 +1761,42 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *p;
struct dl_rq *dl_rq;
- dl_rq = &rq->dl;
+ WARN_ON_ONCE(prev || rf);
- if (need_pull_dl_task(rq, prev)) {
- /*
- * This is OK, because current is on_cpu, which avoids it being
- * picked for load-balance and preemption/IRQs are still
- * disabled avoiding further scheduler activity on it and we're
- * being very careful to re-start the picking loop.
- */
- rq_unpin_lock(rq, rf);
- pull_dl_task(rq);
- rq_repin_lock(rq, rf);
- /*
- * pull_dl_task() can drop (and re-acquire) rq->lock; this
- * means a stop task can slip in, in which case we need to
- * re-start task selection.
- */
- if (rq->stop && task_on_rq_queued(rq->stop))
- return RETRY_TASK;
- }
-
- /*
- * When prev is DL, we may throttle it in put_prev_task().
- * So, we update time before we check for dl_nr_running.
- */
- if (prev->sched_class == &dl_sched_class)
- update_curr_dl(rq);
+ dl_rq = &rq->dl;
if (unlikely(!dl_rq->dl_nr_running))
return NULL;
- put_prev_task(rq, prev);
-
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
- set_next_task(rq, p);
-
- if (hrtick_enabled(rq))
- start_hrtick_dl(rq, p);
-
- deadline_queue_push_tasks(rq);
-
- if (rq->curr->sched_class != &dl_sched_class)
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ set_next_task_dl(rq, p);
return p;
}
-static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
update_curr_dl(rq);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
+
+ if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet started the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_dl_task(rq);
+ rq_repin_lock(rq, rf);
+ }
}
/*
@@ -1811,11 +1830,6 @@ static void task_fork_dl(struct task_struct *p)
*/
}
-static void set_curr_task_dl(struct rq *rq)
-{
- set_next_task(rq, rq->curr);
-}
-
#ifdef CONFIG_SMP
/* Only try algorithms three times */
@@ -2088,17 +2102,13 @@ retry:
}
deactivate_task(rq, next_task, 0);
- sub_running_bw(&next_task->dl, &rq->dl);
- sub_rq_bw(&next_task->dl, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
- add_rq_bw(&next_task->dl, &later_rq->dl);
/*
* Update the later_rq clock here, because the clock is used
* by the cpufreq_update_util() inside __add_running_bw().
*/
update_rq_clock(later_rq);
- add_running_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
ret = 1;
@@ -2186,11 +2196,7 @@ static void pull_dl_task(struct rq *this_rq)
resched = true;
deactivate_task(src_rq, p, 0);
- sub_running_bw(&p->dl, &src_rq->dl);
- sub_rq_bw(&p->dl, &src_rq->dl);
set_task_cpu(p, this_cpu);
- add_rq_bw(&p->dl, &this_rq->dl);
- add_running_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
@@ -2283,6 +2289,36 @@ void __init init_sched_dl_class(void)
GFP_KERNEL, cpu_to_node(i));
}
+void dl_add_task_root_domain(struct task_struct *p)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+ struct dl_bw *dl_b;
+
+ rq = task_rq_lock(p, &rf);
+ if (!dl_task(p))
+ goto unlock;
+
+ dl_b = &rq->rd->dl_bw;
+ raw_spin_lock(&dl_b->lock);
+
+ __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
+
+ raw_spin_unlock(&dl_b->lock);
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+}
+
+void dl_clear_root_domain(struct root_domain *rd)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
+ rd->dl_bw.total_bw = 0;
+ raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
+}
+
#endif /* CONFIG_SMP */
static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@ -2403,6 +2439,7 @@ const struct sched_class dl_sched_class = {
.pick_next_task = pick_next_task_dl,
.put_prev_task = put_prev_task_dl,
+ .set_next_task = set_next_task_dl,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
@@ -2413,7 +2450,6 @@ const struct sched_class dl_sched_class = {
.task_woken = task_woken_dl,
#endif
- .set_curr_task = set_curr_task_dl,
.task_tick = task_tick_dl,
.task_fork = task_fork_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 036be95a87e9..d4bbf68c3161 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -96,12 +96,12 @@ int __weak arch_asym_cpu_priority(int cpu)
}
/*
- * The margin used when comparing utilization with CPU capacity:
- * util * margin < capacity * 1024
+ * The margin used when comparing utilization with CPU capacity.
*
* (default: ~20%)
*/
-static unsigned int capacity_margin = 1280;
+#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
+
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -1086,6 +1086,21 @@ struct numa_group {
unsigned long faults[0];
};
+/*
+ * For functions that can be called in multiple contexts that permit reading
+ * ->numa_group (see struct task_struct for locking rules).
+ */
+static struct numa_group *deref_task_numa_group(struct task_struct *p)
+{
+ return rcu_dereference_check(p->numa_group, p == current ||
+ (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
+}
+
+static struct numa_group *deref_curr_numa_group(struct task_struct *p)
+{
+ return rcu_dereference_protected(p->numa_group, p == current);
+}
+
static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);
@@ -1129,10 +1144,12 @@ static unsigned int task_scan_start(struct task_struct *p)
{
unsigned long smin = task_scan_min(p);
unsigned long period = smin;
+ struct numa_group *ng;
/* Scale the maximum scan period with the amount of shared memory. */
- if (p->numa_group) {
- struct numa_group *ng = p->numa_group;
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
@@ -1140,6 +1157,7 @@ static unsigned int task_scan_start(struct task_struct *p)
period *= shared + 1;
period /= private + shared + 1;
}
+ rcu_read_unlock();
return max(smin, period);
}
@@ -1148,13 +1166,14 @@ static unsigned int task_scan_max(struct task_struct *p)
{
unsigned long smin = task_scan_min(p);
unsigned long smax;
+ struct numa_group *ng;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
/* Scale the maximum scan period with the amount of shared memory. */
- if (p->numa_group) {
- struct numa_group *ng = p->numa_group;
+ ng = deref_curr_numa_group(p);
+ if (ng) {
unsigned long shared = group_faults_shared(ng);
unsigned long private = group_faults_priv(ng);
unsigned long period = smax;
@@ -1169,47 +1188,6 @@ static unsigned int task_scan_max(struct task_struct *p)
return max(smin, smax);
}
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
-{
- int mm_users = 0;
- struct mm_struct *mm = p->mm;
-
- if (mm) {
- mm_users = atomic_read(&mm->mm_users);
- if (mm_users == 1) {
- mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
- mm->numa_scan_seq = 0;
- }
- }
- p->node_stamp = 0;
- p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
- p->numa_work.next = &p->numa_work;
- p->numa_faults = NULL;
- p->numa_group = NULL;
- p->last_task_numa_placement = 0;
- p->last_sum_exec_runtime = 0;
-
- /* New address space, reset the preferred nid */
- if (!(clone_flags & CLONE_VM)) {
- p->numa_preferred_nid = NUMA_NO_NODE;
- return;
- }
-
- /*
- * New thread, keep existing numa_preferred_nid which should be copied
- * already by arch_dup_task_struct but stagger when scans start.
- */
- if (mm) {
- unsigned int delay;
-
- delay = min_t(unsigned int, task_scan_max(current),
- current->numa_scan_period * mm_users * NSEC_PER_MSEC);
- delay += 2 * TICK_NSEC;
- p->node_stamp = delay;
- }
-}
-
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
@@ -1233,7 +1211,16 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
pid_t task_numa_group_id(struct task_struct *p)
{
- return p->numa_group ? p->numa_group->gid : 0;
+ struct numa_group *ng;
+ pid_t gid = 0;
+
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
+ if (ng)
+ gid = ng->gid;
+ rcu_read_unlock();
+
+ return gid;
}
/*
@@ -1258,11 +1245,13 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
static inline unsigned long group_faults(struct task_struct *p, int nid)
{
- if (!p->numa_group)
+ struct numa_group *ng = deref_task_numa_group(p);
+
+ if (!ng)
return 0;
- return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
- p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
+ return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
@@ -1400,12 +1389,13 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
static inline unsigned long group_weight(struct task_struct *p, int nid,
int dist)
{
+ struct numa_group *ng = deref_task_numa_group(p);
unsigned long faults, total_faults;
- if (!p->numa_group)
+ if (!ng)
return 0;
- total_faults = p->numa_group->total_faults;
+ total_faults = ng->total_faults;
if (!total_faults)
return 0;
@@ -1419,7 +1409,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
- struct numa_group *ng = p->numa_group;
+ struct numa_group *ng = deref_curr_numa_group(p);
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
@@ -1600,13 +1590,14 @@ static bool load_too_imbalanced(long src_load, long dst_load,
static void task_numa_compare(struct task_numa_env *env,
long taskimp, long groupimp, bool maymove)
{
+ struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
+ long imp = p_ng ? groupimp : taskimp;
struct task_struct *cur;
long src_load, dst_load;
- long load;
- long imp = env->p->numa_group ? groupimp : taskimp;
- long moveimp = imp;
int dist = env->dist;
+ long moveimp = imp;
+ long load;
if (READ_ONCE(dst_rq->numa_migrate_on))
return;
@@ -1645,21 +1636,22 @@ static void task_numa_compare(struct task_numa_env *env,
* If dst and source tasks are in the same NUMA group, or not
* in any group then look only at task weights.
*/
- if (cur->numa_group == env->p->numa_group) {
+ cur_ng = rcu_dereference(cur->numa_group);
+ if (cur_ng == p_ng) {
imp = taskimp + task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
/*
* Add some hysteresis to prevent swapping the
* tasks within a group over tiny differences.
*/
- if (cur->numa_group)
+ if (cur_ng)
imp -= imp / 16;
} else {
/*
* Compare the group weights. If a task is all by itself
* (not part of a group), use the task weight instead.
*/
- if (cur->numa_group && env->p->numa_group)
+ if (cur_ng && p_ng)
imp += group_weight(cur, env->src_nid, dist) -
group_weight(cur, env->dst_nid, dist);
else
@@ -1757,11 +1749,12 @@ static int task_numa_migrate(struct task_struct *p)
.best_imp = 0,
.best_cpu = -1,
};
+ unsigned long taskweight, groupweight;
struct sched_domain *sd;
+ long taskimp, groupimp;
+ struct numa_group *ng;
struct rq *best_rq;
- unsigned long taskweight, groupweight;
int nid, ret, dist;
- long taskimp, groupimp;
/*
* Pick the lowest SD_NUMA domain, as that would have the smallest
@@ -1807,7 +1800,8 @@ static int task_numa_migrate(struct task_struct *p)
* multiple NUMA nodes; in order to better consolidate the group,
* we need to check other locations.
*/
- if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
+ ng = deref_curr_numa_group(p);
+ if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -1840,7 +1834,7 @@ static int task_numa_migrate(struct task_struct *p)
* A task that migrated to a second choice node will be better off
* trying for a better one later. Do not set the preferred node here.
*/
- if (p->numa_group) {
+ if (ng) {
if (env.best_cpu == -1)
nid = env.src_nid;
else
@@ -2135,6 +2129,7 @@ static void task_numa_placement(struct task_struct *p)
unsigned long total_faults;
u64 runtime, period;
spinlock_t *group_lock = NULL;
+ struct numa_group *ng;
/*
* The p->mm->numa_scan_seq field gets updated without
@@ -2152,8 +2147,9 @@ static void task_numa_placement(struct task_struct *p)
runtime = numa_get_avg_runtime(p, &period);
/* If the task is part of a group prevent parallel updates to group stats */
- if (p->numa_group) {
- group_lock = &p->numa_group->lock;
+ ng = deref_curr_numa_group(p);
+ if (ng) {
+ group_lock = &ng->lock;
spin_lock_irq(group_lock);
}
@@ -2194,7 +2190,7 @@ static void task_numa_placement(struct task_struct *p)
p->numa_faults[cpu_idx] += f_diff;
faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
- if (p->numa_group) {
+ if (ng) {
/*
* safe because we can only change our own group
*
@@ -2202,14 +2198,14 @@ static void task_numa_placement(struct task_struct *p)
* nid and priv in a specific region because it
* is at the beginning of the numa_faults array.
*/
- p->numa_group->faults[mem_idx] += diff;
- p->numa_group->faults_cpu[mem_idx] += f_diff;
- p->numa_group->total_faults += diff;
- group_faults += p->numa_group->faults[mem_idx];
+ ng->faults[mem_idx] += diff;
+ ng->faults_cpu[mem_idx] += f_diff;
+ ng->total_faults += diff;
+ group_faults += ng->faults[mem_idx];
}
}
- if (!p->numa_group) {
+ if (!ng) {
if (faults > max_faults) {
max_faults = faults;
max_nid = nid;
@@ -2220,8 +2216,8 @@ static void task_numa_placement(struct task_struct *p)
}
}
- if (p->numa_group) {
- numa_group_count_active_nodes(p->numa_group);
+ if (ng) {
+ numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_nid);
}
@@ -2255,7 +2251,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
int cpu = cpupid_to_cpu(cpupid);
int i;
- if (unlikely(!p->numa_group)) {
+ if (unlikely(!deref_curr_numa_group(p))) {
unsigned int size = sizeof(struct numa_group) +
4*nr_node_ids*sizeof(unsigned long);
@@ -2291,7 +2287,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (!grp)
goto no_join;
- my_grp = p->numa_group;
+ my_grp = deref_curr_numa_group(p);
if (grp == my_grp)
goto no_join;
@@ -2353,13 +2349,24 @@ no_join:
return;
}
-void task_numa_free(struct task_struct *p)
+/*
+ * Get rid of NUMA staticstics associated with a task (either current or dead).
+ * If @final is set, the task is dead and has reached refcount zero, so we can
+ * safely free all relevant data structures. Otherwise, there might be
+ * concurrent reads from places like load balancing and procfs, and we should
+ * reset the data back to default state without freeing ->numa_faults.
+ */
+void task_numa_free(struct task_struct *p, bool final)
{
- struct numa_group *grp = p->numa_group;
- void *numa_faults = p->numa_faults;
+ /* safe: p either is current or is being freed by current */
+ struct numa_group *grp = rcu_dereference_raw(p->numa_group);
+ unsigned long *numa_faults = p->numa_faults;
unsigned long flags;
int i;
+ if (!numa_faults)
+ return;
+
if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
@@ -2372,8 +2379,14 @@ void task_numa_free(struct task_struct *p)
put_numa_group(grp);
}
- p->numa_faults = NULL;
- kfree(numa_faults);
+ if (final) {
+ p->numa_faults = NULL;
+ kfree(numa_faults);
+ } else {
+ p->total_numa_faults = 0;
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ numa_faults[i] = 0;
+ }
}
/*
@@ -2426,7 +2439,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
* actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down.
*/
- ng = p->numa_group;
+ ng = deref_curr_numa_group(p);
if (!priv && !local && ng && ng->active_nodes > 1 &&
numa_is_active_node(cpu_node, ng) &&
numa_is_active_node(mem_node, ng))
@@ -2469,7 +2482,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
*/
-void task_numa_work(struct callback_head *work)
+static void task_numa_work(struct callback_head *work)
{
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
@@ -2482,7 +2495,7 @@ void task_numa_work(struct callback_head *work)
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
- work->next = work; /* protect against double add */
+ work->next = work;
/*
* Who cares about NUMA placement when they're dying.
*
@@ -2611,6 +2624,50 @@ out:
}
}
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+ int mm_users = 0;
+ struct mm_struct *mm = p->mm;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1) {
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+ mm->numa_scan_seq = 0;
+ }
+ }
+ p->node_stamp = 0;
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ /* Protect against double add, see task_tick_numa and task_numa_work */
+ p->numa_work.next = &p->numa_work;
+ p->numa_faults = NULL;
+ RCU_INIT_POINTER(p->numa_group, NULL);
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
+
+ init_task_work(&p->numa_work, task_numa_work);
+
+ /* New address space, reset the preferred nid */
+ if (!(clone_flags & CLONE_VM)) {
+ p->numa_preferred_nid = NUMA_NO_NODE;
+ return;
+ }
+
+ /*
+ * New thread, keep existing numa_preferred_nid which should be copied
+ * already by arch_dup_task_struct but stagger when scans start.
+ */
+ if (mm) {
+ unsigned int delay;
+
+ delay = min_t(unsigned int, task_scan_max(current),
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+ delay += 2 * TICK_NSEC;
+ p->node_stamp = delay;
+ }
+}
+
/*
* Drive the periodic memory faults..
*/
@@ -2639,10 +2696,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
- if (!time_before(jiffies, curr->mm->numa_next_scan)) {
- init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ if (!time_before(jiffies, curr->mm->numa_next_scan))
task_work_add(curr, work, true);
- }
}
}
@@ -3635,8 +3690,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
return cfs_rq->avg.load_avg;
}
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
-
static inline unsigned long task_util(struct task_struct *p)
{
return READ_ONCE(p->se.avg.util_avg);
@@ -3753,7 +3806,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
- return capacity * 1024 > task_util_est(p) * capacity_margin;
+ return fits_capacity(task_util_est(p), capacity);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4316,8 +4369,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
now = sched_clock_cpu(smp_processor_id());
cfs_b->runtime = cfs_b->quota;
- cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4339,8 +4390,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount, expires;
- int expires_seq;
+ u64 amount = 0, min_amount;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4357,65 +4407,23 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- expires_seq = cfs_b->expires_seq;
- expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
- /*
- * we may have advanced our local expiration to account for allowed
- * spread between our sched_clock and the one on which runtime was
- * issued.
- */
- if (cfs_rq->expires_seq != expires_seq) {
- cfs_rq->expires_seq = expires_seq;
- cfs_rq->runtime_expires = expires;
- }
return cfs_rq->runtime_remaining > 0;
}
-/*
- * Note: This depends on the synchronization provided by sched_clock and the
- * fact that rq->clock snapshots this value.
- */
-static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
- /* if the deadline is ahead of our clock, nothing to do */
- if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
- return;
-
- if (cfs_rq->runtime_remaining < 0)
- return;
-
- /*
- * If the local deadline has passed we have to consider the
- * possibility that our sched_clock is 'fast' and the global deadline
- * has not truly expired.
- *
- * Fortunately we can check determine whether this the case by checking
- * whether the global deadline(cfs_b->expires_seq) has advanced.
- */
- if (cfs_rq->expires_seq == cfs_b->expires_seq) {
- /* extend local deadline, drift is bounded above by 2 ticks */
- cfs_rq->runtime_expires += TICK_NSEC;
- } else {
- /* global deadline is ahead, expiration has passed */
- cfs_rq->runtime_remaining = 0;
- }
-}
-
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
- expire_cfs_rq_runtime(cfs_rq);
if (likely(cfs_rq->runtime_remaining > 0))
return;
+ if (cfs_rq->throttled)
+ return;
/*
* if we're unable to extend our runtime we resched so that the active
* hierarchy can be throttled
@@ -4500,7 +4508,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, dequeue = 1;
+ long task_delta, idle_task_delta, dequeue = 1;
bool empty;
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4511,6 +4519,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rcu_read_unlock();
task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
@@ -4520,6 +4529,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (dequeue)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
if (qcfs_rq->load.weight)
dequeue = 0;
@@ -4559,7 +4569,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
int enqueue = 1;
- long task_delta;
+ long task_delta, idle_task_delta;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -4579,6 +4589,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
return;
task_delta = cfs_rq->h_nr_running;
+ idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
if (se->on_rq)
enqueue = 0;
@@ -4587,6 +4598,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
if (enqueue)
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
cfs_rq->h_nr_running += task_delta;
+ cfs_rq->idle_h_nr_running += idle_task_delta;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -4602,8 +4614,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
resched_curr(rq);
}
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
- u64 remaining, u64 expires)
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
{
struct cfs_rq *cfs_rq;
u64 runtime;
@@ -4619,13 +4630,15 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
if (!cfs_rq_throttled(cfs_rq))
goto next;
+ /* By the above check, this should never be true */
+ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+
runtime = -cfs_rq->runtime_remaining + 1;
if (runtime > remaining)
runtime = remaining;
remaining -= runtime;
cfs_rq->runtime_remaining += runtime;
- cfs_rq->runtime_expires = expires;
/* we check whether we're throttled above */
if (cfs_rq->runtime_remaining > 0)
@@ -4650,7 +4663,7 @@ next:
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
{
- u64 runtime, runtime_expires;
+ u64 runtime;
int throttled;
/* no need to continue the timer with no bandwidth constraint */
@@ -4678,8 +4691,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
- runtime_expires = cfs_b->runtime_expires;
-
/*
* This check is repeated as we are holding onto the new bandwidth while
* we unthrottle. This can potentially race with an unthrottled group
@@ -4692,8 +4703,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
cfs_b->distribute_running = 1;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- runtime = distribute_cfs_runtime(cfs_b, runtime,
- runtime_expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
cfs_b->distribute_running = 0;
@@ -4775,8 +4785,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
return;
raw_spin_lock(&cfs_b->lock);
- if (cfs_b->quota != RUNTIME_INF &&
- cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ if (cfs_b->quota != RUNTIME_INF) {
cfs_b->runtime += slack_runtime;
/* we are under rq->lock, defer unthrottling using a timer */
@@ -4809,7 +4818,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
{
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
unsigned long flags;
- u64 expires;
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
@@ -4827,7 +4835,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- expires = cfs_b->runtime_expires;
if (runtime)
cfs_b->distribute_running = 1;
@@ -4836,11 +4843,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
if (!runtime)
return;
- runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+ runtime = distribute_cfs_runtime(cfs_b, runtime);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
- if (expires == cfs_b->runtime_expires)
- lsub_positive(&cfs_b->runtime, runtime);
+ lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
}
@@ -4997,8 +5003,6 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->period_active = 1;
overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
- cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
- cfs_b->expires_seq++;
hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
@@ -5176,7 +5180,7 @@ static inline unsigned long cpu_util(int cpu);
static inline bool cpu_overutilized(int cpu)
{
- return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+ return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
}
static inline void update_overutilized_status(struct rq *rq)
@@ -5200,6 +5204,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ int idle_h_nr_running = task_has_idle_policy(p);
/*
* The code below (indirectly) updates schedutil which looks at
@@ -5232,6 +5237,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
flags = ENQUEUE_WAKEUP;
}
@@ -5239,6 +5245,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5300,6 +5307,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
+ int idle_h_nr_running = task_has_idle_policy(p);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
@@ -5314,6 +5322,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -5333,6 +5342,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
if (cfs_rq_throttled(cfs_rq))
break;
@@ -5366,6 +5376,15 @@ static struct {
#endif /* CONFIG_NO_HZ_COMMON */
+/* CPU only has SCHED_IDLE tasks enqueued */
+static int sched_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ rq->nr_running);
+}
+
static unsigned long cpu_runnable_load(struct rq *rq)
{
return cfs_rq_runnable_load_avg(&rq->cfs);
@@ -5688,7 +5707,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
- int shallowest_idle_cpu = -1;
+ int shallowest_idle_cpu = -1, si_cpu = -1;
int i;
/* Check if we have any choice: */
@@ -5719,7 +5738,12 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
- } else if (shallowest_idle_cpu == -1) {
+ } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
+ if (sched_idle_cpu(i)) {
+ si_cpu = i;
+ continue;
+ }
+
load = cpu_runnable_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
@@ -5728,7 +5752,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
}
}
- return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+ if (shallowest_idle_cpu != -1)
+ return shallowest_idle_cpu;
+ if (si_cpu != -1)
+ return si_cpu;
+ return least_loaded_cpu;
}
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5881,7 +5909,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
*/
static int select_idle_smt(struct task_struct *p, int target)
{
- int cpu;
+ int cpu, si_cpu = -1;
if (!static_branch_likely(&sched_smt_present))
return -1;
@@ -5891,9 +5919,11 @@ static int select_idle_smt(struct task_struct *p, int target)
continue;
if (available_idle_cpu(cpu))
return cpu;
+ if (si_cpu == -1 && sched_idle_cpu(cpu))
+ si_cpu = cpu;
}
- return -1;
+ return si_cpu;
}
#else /* CONFIG_SCHED_SMT */
@@ -5921,8 +5951,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
u64 avg_cost, avg_idle;
u64 time, cost;
s64 delta;
- int cpu, nr = INT_MAX;
int this = smp_processor_id();
+ int cpu, nr = INT_MAX, si_cpu = -1;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -5950,11 +5980,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
if (!--nr)
- return -1;
+ return si_cpu;
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
break;
+ if (si_cpu == -1 && sched_idle_cpu(cpu))
+ si_cpu = cpu;
}
time = cpu_clock(this) - time;
@@ -5973,13 +6005,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
struct sched_domain *sd;
int i, recent_used_cpu;
- if (available_idle_cpu(target))
+ if (available_idle_cpu(target) || sched_idle_cpu(target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
- if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
+ if (prev != target && cpus_share_cache(prev, target) &&
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)))
return prev;
/* Check a recently used CPU as a potential idle candidate: */
@@ -5987,7 +6020,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
- available_idle_cpu(recent_used_cpu) &&
+ (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
/*
* Replace recent_used_cpu with prev as it is a potential
@@ -6223,69 +6256,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
}
/*
- * compute_energy(): Estimates the energy that would be consumed if @p was
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
* to compute what would be the energy if we decided to actually migrate that
* task.
*/
static long
compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
- unsigned int max_util, util_cfs, cpu_util, cpu_cap;
- unsigned long sum_util, energy = 0;
- struct task_struct *tsk;
+ struct cpumask *pd_mask = perf_domain_span(pd);
+ unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+ unsigned long max_util = 0, sum_util = 0;
int cpu;
- for (; pd; pd = pd->next) {
- struct cpumask *pd_mask = perf_domain_span(pd);
+ /*
+ * The capacity state of CPUs of the current rd can be driven by CPUs
+ * of another rd if they belong to the same pd. So, account for the
+ * utilization of these CPUs too by masking pd with cpu_online_mask
+ * instead of the rd span.
+ *
+ * If an entire pd is outside of the current rd, it will not appear in
+ * its pd list and will not be accounted by compute_energy().
+ */
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+ unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
+ struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
/*
- * The energy model mandates all the CPUs of a performance
- * domain have the same capacity.
+ * Busy time computation: utilization clamping is not
+ * required since the ratio (sum_util / cpu_capacity)
+ * is already enough to scale the EM reported power
+ * consumption at the (eventually clamped) cpu_capacity.
*/
- cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
- max_util = sum_util = 0;
+ sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ ENERGY_UTIL, NULL);
/*
- * The capacity state of CPUs of the current rd can be driven by
- * CPUs of another rd if they belong to the same performance
- * domain. So, account for the utilization of these CPUs too
- * by masking pd with cpu_online_mask instead of the rd span.
- *
- * If an entire performance domain is outside of the current rd,
- * it will not appear in its pd list and will not be accounted
- * by compute_energy().
+ * Performance domain frequency: utilization clamping
+ * must be considered since it affects the selection
+ * of the performance domain frequency.
+ * NOTE: in case RT tasks are running, by default the
+ * FREQUENCY_UTIL's utilization can be max OPP.
*/
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
- util_cfs = cpu_util_next(cpu, p, dst_cpu);
-
- /*
- * Busy time computation: utilization clamping is not
- * required since the ratio (sum_util / cpu_capacity)
- * is already enough to scale the EM reported power
- * consumption at the (eventually clamped) cpu_capacity.
- */
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- ENERGY_UTIL, NULL);
-
- /*
- * Performance domain frequency: utilization clamping
- * must be considered since it affects the selection
- * of the performance domain frequency.
- * NOTE: in case RT tasks are running, by default the
- * FREQUENCY_UTIL's utilization can be max OPP.
- */
- tsk = cpu == dst_cpu ? p : NULL;
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
- FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
- }
-
- energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+ cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, cpu_util);
}
- return energy;
+ return em_pd_energy(pd->em_pd, max_util, sum_util);
}
/*
@@ -6327,21 +6346,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* other use-cases too. So, until someone finds a better way to solve this,
* let's keep things simple by re-using the existing slow path.
*/
-
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
- unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+ unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ unsigned long cpu_cap, util, base_energy = 0;
int cpu, best_energy_cpu = prev_cpu;
- struct perf_domain *head, *pd;
- unsigned long cpu_cap, util;
struct sched_domain *sd;
+ struct perf_domain *pd;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
if (!pd || READ_ONCE(rd->overutilized))
goto fail;
- head = pd;
/*
* Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6358,9 +6375,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
goto unlock;
for (; pd; pd = pd->next) {
- unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+ unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+ unsigned long base_energy_pd;
int max_spare_cap_cpu = -1;
+ /* Compute the 'base' energy of the pd, without @p */
+ base_energy_pd = compute_energy(p, -1, pd);
+ base_energy += base_energy_pd;
+
for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
@@ -6368,14 +6390,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
/* Skip CPUs that will be overutilized. */
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
- if (cpu_cap * 1024 < util * capacity_margin)
+ if (!fits_capacity(util, cpu_cap))
continue;
/* Always use prev_cpu as a candidate. */
if (cpu == prev_cpu) {
- prev_energy = compute_energy(p, prev_cpu, head);
- best_energy = min(best_energy, prev_energy);
- continue;
+ prev_delta = compute_energy(p, prev_cpu, pd);
+ prev_delta -= base_energy_pd;
+ best_delta = min(best_delta, prev_delta);
}
/*
@@ -6391,9 +6413,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
/* Evaluate the energy impact of using this CPU. */
if (max_spare_cap_cpu >= 0) {
- cur_energy = compute_energy(p, max_spare_cap_cpu, head);
- if (cur_energy < best_energy) {
- best_energy = cur_energy;
+ cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
+ cur_delta -= base_energy_pd;
+ if (cur_delta < best_delta) {
+ best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
}
}
@@ -6405,10 +6428,10 @@ unlock:
* Pick the best CPU if prev_cpu cannot be used, or if it saves at
* least 6% of the energy used by prev_cpu.
*/
- if (prev_energy == ULONG_MAX)
+ if (prev_delta == ULONG_MAX)
return best_energy_cpu;
- if ((prev_energy - best_energy) > (prev_energy >> 4))
+ if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
return best_energy_cpu;
return prev_cpu;
@@ -6742,7 +6765,7 @@ again:
goto idle;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (prev->sched_class != &fair_sched_class)
+ if (!prev || prev->sched_class != &fair_sched_class)
goto simple;
/*
@@ -6819,8 +6842,8 @@ again:
goto done;
simple:
#endif
-
- put_prev_task(rq, prev);
+ if (prev)
+ put_prev_task(rq, prev);
do {
se = pick_next_entity(cfs_rq, NULL);
@@ -6848,11 +6871,13 @@ done: __maybe_unused;
return p;
idle:
- update_misfit_status(NULL, rq);
- new_tasks = idle_balance(rq, rf);
+ if (!rf)
+ return NULL;
+
+ new_tasks = newidle_balance(rq, rf);
/*
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * Because newidle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
* must re-start the pick_next_entity() loop.
*/
@@ -6874,7 +6899,7 @@ idle:
/*
* Account for a descheduled task:
*/
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
@@ -7376,7 +7401,7 @@ static int detach_tasks(struct lb_env *env)
detached++;
env->imbalance -= load;
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is detached to minimize
@@ -7923,8 +7948,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
static inline bool
group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
- return sg->sgc->min_capacity * capacity_margin <
- ref->sgc->min_capacity * 1024;
+ return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
}
/*
@@ -7934,8 +7958,7 @@ group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
static inline bool
group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
- return sg->sgc->max_capacity * capacity_margin <
- ref->sgc->max_capacity * 1024;
+ return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
}
static inline enum
@@ -8993,9 +9016,10 @@ more_balance:
out_balanced:
/*
* We reach balance although we may have faced some affinity
- * constraints. Clear the imbalance flag if it was set.
+ * constraints. Clear the imbalance flag only if other tasks got
+ * a chance to move and fix the imbalance.
*/
- if (sd_parent) {
+ if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
@@ -9016,10 +9040,10 @@ out_one_pinned:
ld_moved = 0;
/*
- * idle_balance() disregards balance intervals, so we could repeatedly
- * reach this code, which would lead to balance_interval skyrocketting
- * in a short amount of time. Skip the balance_interval increase logic
- * to avoid that.
+ * newidle_balance() disregards balance intervals, so we could
+ * repeatedly reach this code, which would lead to balance_interval
+ * skyrocketting in a short amount of time. Skip the balance_interval
+ * increase logic to avoid that.
*/
if (env.idle == CPU_NEWLY_IDLE)
goto out;
@@ -9729,7 +9753,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
{
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -9737,6 +9761,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
int pulled_task = 0;
u64 curr_cost = 0;
+ update_misfit_status(NULL, this_rq);
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -10121,9 +10146,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* This routine is mostly called to set cfs_rq->curr field when a task
* migrates between groups/classes.
*/
-static void set_curr_task_fair(struct rq *rq)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p)
{
- struct sched_entity *se = &rq->curr->se;
+ struct sched_entity *se = &p->se;
+
+#ifdef CONFIG_SMP
+ if (task_on_rq_queued(p)) {
+ /*
+ * Move the next running task to the front of the list, so our
+ * cfs_tasks list becomes MRU one.
+ */
+ list_move(&se->group_node, &rq->cfs_tasks);
+ }
+#endif
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -10241,18 +10276,18 @@ err:
void online_fair_sched_group(struct task_group *tg)
{
struct sched_entity *se;
+ struct rq_flags rf;
struct rq *rq;
int i;
for_each_possible_cpu(i) {
rq = cpu_rq(i);
se = tg->se[i];
-
- raw_spin_lock_irq(&rq->lock);
+ rq_lock_irq(rq, &rf);
update_rq_clock(rq);
attach_entity_cfs_rq(se);
sync_throttle(tg, i);
- raw_spin_unlock_irq(&rq->lock);
+ rq_unlock_irq(rq, &rf);
}
}
@@ -10394,7 +10429,9 @@ const struct sched_class fair_sched_class = {
.check_preempt_curr = check_preempt_wakeup,
.pick_next_task = pick_next_task_fair,
+
.put_prev_task = put_prev_task_fair,
+ .set_next_task = set_next_task_fair,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_fair,
@@ -10407,7 +10444,6 @@ const struct sched_class fair_sched_class = {
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_fair,
.task_tick = task_tick_fair,
.task_fork = task_fork_fair,
@@ -10444,18 +10480,22 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
{
int node;
unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
+ struct numa_group *ng;
+ rcu_read_lock();
+ ng = rcu_dereference(p->numa_group);
for_each_online_node(node) {
if (p->numa_faults) {
tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
}
- if (p->numa_group) {
- gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
- gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ if (ng) {
+ gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
+ gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
}
print_numa_stats(m, node, tsf, tpf, gsf, gpf);
}
+ rcu_read_unlock();
}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80940939b733..c892c6280c9f 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -241,13 +241,14 @@ static void do_idle(void)
check_pgt_cache();
rmb();
+ local_irq_disable();
+
if (cpu_is_offline(cpu)) {
- tick_nohz_idle_stop_tick_protected();
+ tick_nohz_idle_stop_tick();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
- local_irq_disable();
arch_cpu_idle_enter();
/*
@@ -311,7 +312,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-void play_idle(unsigned long duration_ms)
+void play_idle(unsigned long duration_us)
{
struct idle_timer it;
@@ -323,7 +324,7 @@ void play_idle(unsigned long duration_ms)
WARN_ON_ONCE(current->nr_cpus_allowed != 1);
WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
- WARN_ON_ONCE(!duration_ms);
+ WARN_ON_ONCE(!duration_us);
rcu_sleep_check();
preempt_disable();
@@ -333,7 +334,8 @@ void play_idle(unsigned long duration_ms)
it.done = 0;
hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
it.timer.function = idle_inject_timer_fn;
- hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
+ hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC),
+ HRTIMER_MODE_REL_PINNED);
while (!READ_ONCE(it.done))
do_idle();
@@ -374,14 +376,27 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_curr(rq);
}
-static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+}
+
+static void set_next_task_idle(struct rq *rq, struct task_struct *next)
{
- put_prev_task(rq, prev);
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
+}
+
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *next = rq->idle;
- return rq->idle;
+ if (prev)
+ put_prev_task(rq, prev);
+
+ set_next_task_idle(rq, next);
+
+ return next;
}
/*
@@ -397,10 +412,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
raw_spin_lock_irq(&rq->lock);
}
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
-{
-}
-
/*
* scheduler tick hitting a task of our scheduling class.
*
@@ -413,10 +424,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void set_curr_task_idle(struct rq *rq)
-{
-}
-
static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
BUG();
@@ -451,13 +458,13 @@ const struct sched_class idle_sched_class = {
.pick_next_task = pick_next_task_idle,
.put_prev_task = put_prev_task_idle,
+ .set_next_task = set_next_task_idle,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_idle,
.task_tick = task_tick_idle,
.get_rr_interval = get_rr_interval_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 123ea07a3f3b..9fcb2a695a41 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -14,11 +14,25 @@ EXPORT_SYMBOL_GPL(housekeeping_overridden);
static cpumask_var_t housekeeping_mask;
static unsigned int housekeeping_flags;
+bool housekeeping_enabled(enum hk_flags flags)
+{
+ return !!(housekeeping_flags & flags);
+}
+EXPORT_SYMBOL_GPL(housekeeping_enabled);
+
int housekeeping_any_cpu(enum hk_flags flags)
{
- if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
+ int cpu;
+
+ if (static_branch_unlikely(&housekeeping_overridden)) {
+ if (housekeeping_flags & flags) {
+ cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
+ if (cpu < nr_cpu_ids)
+ return cpu;
+
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+ }
+ }
return smp_processor_id();
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7acc632c3b82..517e3719027e 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1051,7 +1051,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
if (!rcu_access_pointer(group->poll_kworker)) {
struct sched_param param = {
- .sched_priority = MAX_RT_PRIO - 1,
+ .sched_priority = 1,
};
struct kthread_worker *kworker;
@@ -1061,7 +1061,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
mutex_unlock(&group->trigger_lock);
return ERR_CAST(kworker);
}
- sched_setscheduler(kworker->task, SCHED_FIFO, &param);
+ sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
kthread_init_delayed_work(&group->poll_work,
psi_poll_work);
rcu_assign_pointer(group->poll_kworker, kworker);
@@ -1131,7 +1131,15 @@ static void psi_trigger_destroy(struct kref *ref)
* deadlock while waiting for psi_poll_work to acquire trigger_lock
*/
if (kworker_to_destroy) {
+ /*
+ * After the RCU grace period has expired, the worker
+ * can no longer be found through group->poll_kworker.
+ * But it might have been already scheduled before
+ * that - deschedule it cleanly before destroying it.
+ */
kthread_cancel_delayed_work_sync(&group->poll_work);
+ atomic_set(&group->poll_scheduled, 0);
+
kthread_destroy_worker(kworker_to_destroy);
}
kfree(t);
@@ -1190,7 +1198,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
- buf_size = min(nbytes, (sizeof(buf) - 1));
+ buf_size = min(nbytes, sizeof(buf));
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a532558a5176..ebaa4e619684 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -45,8 +45,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
raw_spin_lock_init(&rt_b->rt_runtime_lock);
- hrtimer_init(&rt_b->rt_period_timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_HARD);
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
@@ -67,7 +67,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
* to update the period.
*/
hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
- hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+ hrtimer_start_expires(&rt_b->rt_period_timer,
+ HRTIMER_MODE_ABS_PINNED_HARD);
}
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -1498,12 +1499,22 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
{
p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
+
+ /*
+ * If prev task was rt, put_prev_task() has already updated the
+ * utilization. We only care of the case where we start to schedule a
+ * rt task
+ */
+ if (rq->curr->sched_class != &rt_sched_class)
+ update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+ rt_queue_push_tasks(rq);
}
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -1543,56 +1554,19 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
- if (need_pull_rt_task(rq, prev)) {
- /*
- * This is OK, because current is on_cpu, which avoids it being
- * picked for load-balance and preemption/IRQs are still
- * disabled avoiding further scheduler activity on it and we're
- * being very careful to re-start the picking loop.
- */
- rq_unpin_lock(rq, rf);
- pull_rt_task(rq);
- rq_repin_lock(rq, rf);
- /*
- * pull_rt_task() can drop (and re-acquire) rq->lock; this
- * means a dl or stop task can slip in, in which case we need
- * to re-start task selection.
- */
- if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
- rq->dl.dl_nr_running))
- return RETRY_TASK;
- }
-
- /*
- * We may dequeue prev's rt_rq in put_prev_task().
- * So, we update time before rt_queued check.
- */
- if (prev->sched_class == &rt_sched_class)
- update_curr_rt(rq);
+ WARN_ON_ONCE(prev || rf);
if (!rt_rq->rt_queued)
return NULL;
- put_prev_task(rq, prev);
-
p = _pick_next_task_rt(rq);
- set_next_task(rq, p);
-
- rt_queue_push_tasks(rq);
-
- /*
- * If prev task was rt, put_prev_task() has already updated the
- * utilization. We only care of the case where we start to schedule a
- * rt task
- */
- if (rq->curr->sched_class != &rt_sched_class)
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+ set_next_task_rt(rq, p);
return p;
}
-static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
update_curr_rt(rq);
@@ -1604,6 +1578,18 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
*/
if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet started the picking loop.
+ */
+ rq_unpin_lock(rq, rf);
+ pull_rt_task(rq);
+ rq_repin_lock(rq, rf);
+ }
}
#ifdef CONFIG_SMP
@@ -2304,8 +2290,10 @@ static void watchdog(struct rq *rq, struct task_struct *p)
}
next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
- if (p->rt.timeout > next)
- p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
+ if (p->rt.timeout > next) {
+ posix_cputimers_rt_watchdog(&p->posix_cputimers,
+ p->se.sum_exec_runtime);
+ }
}
}
#else
@@ -2354,11 +2342,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
}
}
-static void set_curr_task_rt(struct rq *rq)
-{
- set_next_task(rq, rq->curr);
-}
-
static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
/*
@@ -2380,6 +2363,7 @@ const struct sched_class rt_sched_class = {
.pick_next_task = pick_next_task_rt,
.put_prev_task = put_prev_task_rt,
+ .set_next_task = set_next_task_rt,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_rt,
@@ -2391,7 +2375,6 @@ const struct sched_class rt_sched_class = {
.switched_from = switched_from_rt,
#endif
- .set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
.get_rr_interval = get_rr_interval_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 802b1f3405f2..b3cb895d14a2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -335,8 +335,6 @@ struct cfs_bandwidth {
u64 quota;
u64 runtime;
s64 hierarchical_quota;
- u64 runtime_expires;
- int expires_seq;
u8 idle;
u8 period_active;
@@ -393,6 +391,16 @@ struct task_group {
#endif
struct cfs_bandwidth cfs_bandwidth;
+
+#ifdef CONFIG_UCLAMP_TASK_GROUP
+ /* The two decimal precision [%] value requested from user-space */
+ unsigned int uclamp_pct[UCLAMP_CNT];
+ /* Clamp values requested for a task group */
+ struct uclamp_se uclamp_req[UCLAMP_CNT];
+ /* Effective clamp values used for a task group */
+ struct uclamp_se uclamp[UCLAMP_CNT];
+#endif
+
};
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -483,7 +491,8 @@ struct cfs_rq {
struct load_weight load;
unsigned long runnable_weight;
unsigned int nr_running;
- unsigned int h_nr_running;
+ unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int idle_h_nr_running; /* SCHED_IDLE */
u64 exec_clock;
u64 min_vruntime;
@@ -556,8 +565,6 @@ struct cfs_rq {
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
- int expires_seq;
- u64 runtime_expires;
s64 runtime_remaining;
u64 throttled_clock;
@@ -777,9 +784,6 @@ struct root_domain {
struct perf_domain __rcu *pd;
};
-extern struct root_domain def_root_domain;
-extern struct mutex sched_domains_mutex;
-
extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
@@ -1261,16 +1265,18 @@ enum numa_topology_type {
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
-#endif
-
-#ifdef CONFIG_NUMA
extern void sched_init_numa(void);
extern void sched_domains_numa_masks_set(unsigned int cpu);
extern void sched_domains_numa_masks_clear(unsigned int cpu);
+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
#else
static inline void sched_init_numa(void) { }
static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ return nr_cpu_ids;
+}
#endif
#ifdef CONFIG_NUMA_BALANCING
@@ -1449,10 +1455,14 @@ static inline void unregister_sched_domain_sysctl(void)
}
#endif
+extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
#else
static inline void sched_ttwu_pending(void) { }
+static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
+
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -1700,17 +1710,21 @@ struct sched_class {
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
/*
- * It is the responsibility of the pick_next_task() method that will
- * return the next task to call put_prev_task() on the @prev task or
- * something equivalent.
+ * Both @prev and @rf are optional and may be NULL, in which case the
+ * caller must already have invoked put_prev_task(rq, prev, rf).
+ *
+ * Otherwise it is the responsibility of the pick_next_task() to call
+ * put_prev_task() on the @prev task or something equivalent, IFF it
+ * returns a next task.
*
- * May return RETRY_TASK when it finds a higher prio class has runnable
- * tasks.
+ * In that case (@rf != NULL) it may return RETRY_TASK when it finds a
+ * higher prio class has runnable tasks.
*/
struct task_struct * (*pick_next_task)(struct rq *rq,
struct task_struct *prev,
struct rq_flags *rf);
- void (*put_prev_task)(struct rq *rq, struct task_struct *p);
+ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf);
+ void (*set_next_task)(struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
@@ -1725,7 +1739,6 @@ struct sched_class {
void (*rq_offline)(struct rq *rq);
#endif
- void (*set_curr_task)(struct rq *rq);
void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
void (*task_fork)(struct task_struct *p);
void (*task_dead)(struct task_struct *p);
@@ -1755,12 +1768,14 @@ struct sched_class {
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- prev->sched_class->put_prev_task(rq, prev);
+ WARN_ON_ONCE(rq->curr != prev);
+ prev->sched_class->put_prev_task(rq, prev, NULL);
}
-static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
+static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
- curr->sched_class->set_curr_task(rq);
+ WARN_ON_ONCE(rq->curr != next);
+ next->sched_class->set_next_task(rq, next);
}
#ifdef CONFIG_SMP
@@ -1943,7 +1958,7 @@ unsigned long arch_scale_freq_capacity(int cpu)
#endif
#ifdef CONFIG_SMP
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1995,7 +2010,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
return ret;
}
-#endif /* CONFIG_PREEMPT */
+#endif /* CONFIG_PREEMPTION */
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
@@ -2266,7 +2281,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
-unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
+enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static __always_inline
unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index aa0de240fb41..ba683fe81a6e 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -157,9 +157,10 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{
unsigned long long now = rq_clock(rq), delta = 0;
- if (unlikely(sched_info_on()))
+ if (sched_info_on()) {
if (t->sched_info.last_queued)
delta = now - t->sched_info.last_queued;
+ }
sched_info_reset_dequeued(t);
t->sched_info.run_delay += delta;
@@ -192,7 +193,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
*/
static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
- if (unlikely(sched_info_on())) {
+ if (sched_info_on()) {
if (!t->sched_info.last_queued)
t->sched_info.last_queued = rq_clock(rq);
}
@@ -239,7 +240,7 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
static inline void
sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
- if (unlikely(sched_info_on()))
+ if (sched_info_on())
__sched_info_switch(rq, prev, next);
}
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index c183b790ca54..7e1cee4e65b2 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,17 +23,22 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
+static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
+{
+ stop->se.exec_start = rq_clock_task(rq);
+}
+
static struct task_struct *
pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *stop = rq->stop;
+ WARN_ON_ONCE(prev || rf);
+
if (!stop || !task_on_rq_queued(stop))
return NULL;
- put_prev_task(rq, prev);
-
- stop->se.exec_start = rq_clock_task(rq);
+ set_next_task_stop(rq, stop);
return stop;
}
@@ -55,7 +60,7 @@ static void yield_task_stop(struct rq *rq)
BUG(); /* the stop task should never yield, its pointless. */
}
-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct task_struct *curr = rq->curr;
u64 delta_exec;
@@ -86,13 +91,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
-static void set_curr_task_stop(struct rq *rq)
-{
- struct task_struct *stop = rq->stop;
-
- stop->se.exec_start = rq_clock_task(rq);
-}
-
static void switched_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
@@ -128,13 +126,13 @@ const struct sched_class stop_sched_class = {
.pick_next_task = pick_next_task_stop,
.put_prev_task = put_prev_task_stop,
+ .set_next_task = set_next_task_stop,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_stop,
.set_cpus_allowed = set_cpus_allowed_common,
#endif
- .set_curr_task = set_curr_task_stop,
.task_tick = task_tick_stop,
.get_rr_interval = get_rr_interval_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f751ce0b783e..b5667a273bf6 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1284,6 +1284,7 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
#endif
/*
@@ -1402,7 +1403,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
- if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+ if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
sd->flags &= ~(SD_BALANCE_EXEC |
SD_BALANCE_FORK |
SD_WAKE_AFFINE);
@@ -1724,6 +1725,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
}
}
+/*
+ * sched_numa_find_closest() - given the NUMA topology, find the cpu
+ * closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: cpu to be close to
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ int i, j = cpu_to_node(cpu);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ }
+ return nr_cpu_ids;
+}
+
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -2149,16 +2170,16 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
* ndoms_new == 0 is a special case for destroying existing domains,
* and it will not create the default domain.
*
- * Call with hotplug lock held
+ * Call with hotplug lock and sched_domains_mutex held
*/
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
+void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
{
bool __maybe_unused has_eas = false;
int i, j, n;
int new_topology;
- mutex_lock(&sched_domains_mutex);
+ lockdep_assert_held(&sched_domains_mutex);
/* Always unregister in case we don't destroy any domains: */
unregister_sched_domain_sysctl();
@@ -2183,8 +2204,19 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
for (i = 0; i < ndoms_cur; i++) {
for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_cur[i], doms_new[j]) &&
- dattrs_equal(dattr_cur, i, dattr_new, j))
+ dattrs_equal(dattr_cur, i, dattr_new, j)) {
+ struct root_domain *rd;
+
+ /*
+ * This domain won't be destroyed and as such
+ * its dl_bw->total_bw needs to be cleared. It
+ * will be recomputed in function
+ * update_tasks_root_domain().
+ */
+ rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
+ dl_clear_root_domain(rd);
goto match1;
+ }
}
/* No match - a current sched domain not in new doms_new[] */
detach_destroy_domains(doms_cur[i]);
@@ -2241,6 +2273,15 @@ match3:
ndoms_cur = ndoms_new;
register_sched_domain_sysctl();
+}
+/*
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+ struct sched_domain_attr *dattr_new)
+{
+ mutex_lock(&sched_domains_mutex);
+ partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
mutex_unlock(&sched_domains_mutex);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index dabe100d2091..c4da1ef56fdf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -90,6 +90,11 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
handler == SIG_DFL && !(force && sig_kernel_only(sig)))
return true;
+ /* Only allow kernel generated signals to this kthread */
+ if (unlikely((t->flags & PF_KTHREAD) &&
+ (handler == SIG_KTHREAD_KERNEL) && !force))
+ return true;
+
return sig_handler_ignored(handler, sig);
}
@@ -349,7 +354,7 @@ void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
* @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
* Group stop states are cleared and the group stop count is consumed if
* %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
- * stop, the appropriate %SIGNAL_* flags are set.
+ * stop, the appropriate `SIGNAL_*` flags are set.
*
* CONTEXT:
* Must be called with @task->sighand->siglock held.
@@ -1885,6 +1890,7 @@ static void do_notify_pidfd(struct task_struct *task)
{
struct pid *pid;
+ WARN_ON(task->exit_state == 0);
pid = task_pid(task);
wake_up_all(&pid->wait_pidfd);
}
@@ -2951,80 +2957,49 @@ EXPORT_SYMBOL(sigprocmask);
*
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed from userland for the syscalls.
+ *
+ * Note that it does set_restore_sigmask() in advance, so it must be always
+ * paired with restore_saved_sigmask_unless() before return from syscall.
*/
-int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set,
- sigset_t *oldset, size_t sigsetsize)
+int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
- if (copy_from_user(set, usigmask, sizeof(sigset_t)))
+ if (copy_from_user(&kmask, umask, sizeof(sigset_t)))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(&kmask);
return 0;
}
-EXPORT_SYMBOL(set_user_sigmask);
#ifdef CONFIG_COMPAT
-int set_compat_user_sigmask(const compat_sigset_t __user *usigmask,
- sigset_t *set, sigset_t *oldset,
+int set_compat_user_sigmask(const compat_sigset_t __user *umask,
size_t sigsetsize)
{
- if (!usigmask)
- return 0;
+ sigset_t kmask;
+ if (!umask)
+ return 0;
if (sigsetsize != sizeof(compat_sigset_t))
return -EINVAL;
- if (get_compat_sigset(set, usigmask))
+ if (get_compat_sigset(&kmask, umask))
return -EFAULT;
- *oldset = current->blocked;
- set_current_blocked(set);
+ set_restore_sigmask();
+ current->saved_sigmask = current->blocked;
+ set_current_blocked(&kmask);
return 0;
}
-EXPORT_SYMBOL(set_compat_user_sigmask);
#endif
-/*
- * restore_user_sigmask:
- * usigmask: sigmask passed in from userland.
- * sigsaved: saved sigmask when the syscall started and changed the sigmask to
- * usigmask.
- *
- * This is useful for syscalls such as ppoll, pselect, io_pgetevents and
- * epoll_pwait where a new sigmask is passed in from userland for the syscalls.
- */
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
- bool interrupted)
-{
-
- if (!usigmask)
- return;
- /*
- * When signals are pending, do not restore them here.
- * Restoring sigmask here can lead to delivering signals that the above
- * syscalls are intended to block because of the sigmask passed in.
- */
- if (interrupted) {
- current->saved_sigmask = *sigsaved;
- set_restore_sigmask();
- return;
- }
-
- /*
- * This is needed because the fast syscall return path does not restore
- * saved_sigmask when signals are not pending.
- */
- set_current_blocked(sigsaved);
-}
-EXPORT_SYMBOL(restore_user_sigmask);
-
/**
* sys_rt_sigprocmask - change the list of currently blocked signals
* @how: whether to add, remove, or set signals
@@ -3703,8 +3678,11 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
static struct pid *pidfd_to_pid(const struct file *file)
{
- if (file->f_op == &pidfd_fops)
- return file->private_data;
+ struct pid *pid;
+
+ pid = pidfd_pid(file);
+ if (!IS_ERR(pid))
+ return pid;
return tgid_pidfd_to_pid(file);
}
diff --git a/kernel/smp.c b/kernel/smp.c
index 616d4d114847..7dbcb402c2fc 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -291,6 +291,14 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
&& !oops_in_progress);
+ /*
+ * When @wait we can deadlock when we interrupt between llist_add() and
+ * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
+ * csd_lock() on because the interrupt context uses the same csd
+ * storage.
+ */
+ WARN_ON_ONCE(!in_task());
+
csd = &csd_stack;
if (!wait) {
csd = this_cpu_ptr(&csd_data);
@@ -416,6 +424,14 @@ void smp_call_function_many(const struct cpumask *mask,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
&& !oops_in_progress && !early_boot_irqs_disabled);
+ /*
+ * When @wait we can deadlock when we interrupt between llist_add() and
+ * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
+ * csd_lock() on because the interrupt context uses the same csd
+ * storage.
+ */
+ WARN_ON_ONCE(!in_task());
+
/* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
if (cpu == this_cpu)
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index e6a02b274b73..6d1f68b7e528 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -20,7 +20,7 @@
* @nr_entries: Number of entries in the storage array
* @spaces: Number of leading spaces to print
*/
-void stack_trace_print(unsigned long *entries, unsigned int nr_entries,
+void stack_trace_print(const unsigned long *entries, unsigned int nr_entries,
int spaces)
{
unsigned int i;
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(stack_trace_print);
*
* Return: Number of bytes printed.
*/
-int stack_trace_snprint(char *buf, size_t size, unsigned long *entries,
+int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries,
unsigned int nr_entries, int spaces)
{
unsigned int generated, i, total = 0;
@@ -226,12 +226,17 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
.store = store,
.size = size,
};
+ mm_segment_t fs;
/* Trace user stack if not a kernel thread */
if (current->flags & PF_KTHREAD)
return 0;
+ fs = get_fs();
+ set_fs(USER_DS);
arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
+ set_fs(fs);
+
return c.len;
}
#endif
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b4f83f7bdf86..c7031a22aa7b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -383,6 +383,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
*/
preempt_disable();
stop_cpus_in_progress = true;
+ barrier();
for_each_cpu(cpu, cpumask) {
work = &per_cpu(cpu_stopper.stop_work, cpu);
work->fn = fn;
@@ -391,6 +392,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
if (cpu_stop_queue_work(cpu, work))
queued = true;
}
+ barrier();
stop_cpus_in_progress = false;
preempt_enable();
diff --git a/kernel/sys.c b/kernel/sys.c
index 2969304c29fe..a611d1d58c7d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -103,12 +103,6 @@
#ifndef SET_TSC_CTL
# define SET_TSC_CTL(a) (-EINVAL)
#endif
-#ifndef MPX_ENABLE_MANAGEMENT
-# define MPX_ENABLE_MANAGEMENT() (-EINVAL)
-#endif
-#ifndef MPX_DISABLE_MANAGEMENT
-# define MPX_DISABLE_MANAGEMENT() (-EINVAL)
-#endif
#ifndef GET_FP_MODE
# define GET_FP_MODE(a) (-EINVAL)
#endif
@@ -124,6 +118,12 @@
#ifndef PAC_RESET_KEYS
# define PAC_RESET_KEYS(a, b) (-EINVAL)
#endif
+#ifndef SET_TAGGED_ADDR_CTRL
+# define SET_TAGGED_ADDR_CTRL(a) (-EINVAL)
+#endif
+#ifndef GET_TAGGED_ADDR_CTRL
+# define GET_TAGGED_ADDR_CTRL() (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -1557,15 +1557,6 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
retval = -EPERM;
if (!retval)
retval = security_task_setrlimit(tsk, resource, new_rlim);
- if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
- /*
- * The caller is asking for an immediate RLIMIT_CPU
- * expiry. But we use the zero value to mean "it was
- * never set". So let's cheat and make it one second
- * instead
- */
- new_rlim->rlim_cur = 1;
- }
}
if (!retval) {
if (old_rlim)
@@ -1576,10 +1567,9 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
task_unlock(tsk->group_leader);
/*
- * RLIMIT_CPU handling. Note that the kernel fails to return an error
- * code if it rejected the user's attempt to set RLIMIT_CPU. This is a
- * very long-standing error, and fixing it now risks breakage of
- * applications, so we live with it
+ * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
+ * infite. In case of RLIM_INFINITY the posix CPU timer code
+ * ignores the rlimit.
*/
if (!retval && new_rlim && resource == RLIMIT_CPU &&
new_rlim->rlim_cur != RLIM_INFINITY &&
@@ -2456,15 +2446,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
up_write(&me->mm->mmap_sem);
break;
case PR_MPX_ENABLE_MANAGEMENT:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = MPX_ENABLE_MANAGEMENT();
- break;
case PR_MPX_DISABLE_MANAGEMENT:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = MPX_DISABLE_MANAGEMENT();
- break;
+ /* No longer implemented: */
+ return -EINVAL;
case PR_SET_FP_MODE:
error = SET_FP_MODE(me, arg2);
break;
@@ -2492,6 +2476,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = PAC_RESET_KEYS(me, arg2);
break;
+ case PR_SET_TAGGED_ADDR_CTRL:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = SET_TAGGED_ADDR_CTRL(arg2);
+ break;
+ case PR_GET_TAGGED_ADDR_CTRL:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = GET_TAGGED_ADDR_CTRL();
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1c1ad1e14f21..078950d9605b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -125,9 +125,6 @@ static int sixty = 60;
#endif
static int __maybe_unused neg_one = -1;
-
-static int zero;
-static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static unsigned long zero_ul;
@@ -188,17 +185,17 @@ extern int no_unaligned_warning;
* enum sysctl_writes_mode - supported sysctl write modes
*
* @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value
- * to be written, and multiple writes on the same sysctl file descriptor
- * will rewrite the sysctl value, regardless of file position. No warning
- * is issued when the initial position is not 0.
+ * to be written, and multiple writes on the same sysctl file descriptor
+ * will rewrite the sysctl value, regardless of file position. No warning
+ * is issued when the initial position is not 0.
* @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is
- * not 0.
+ * not 0.
* @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at
- * file position 0 and the value must be fully contained in the buffer
- * sent to the write syscall. If dealing with strings respect the file
- * position, but restrict this to the max length of the buffer, anything
- * passed the max lenght will be ignored. Multiple writes will append
- * to the buffer.
+ * file position 0 and the value must be fully contained in the buffer
+ * sent to the write syscall. If dealing with strings respect the file
+ * position, but restrict this to the max length of the buffer, anything
+ * passed the max length will be ignored. Multiple writes will append
+ * to the buffer.
*
* These write modes control how current file position affects the behavior of
* updating sysctl values through the proc interface on each write.
@@ -385,8 +382,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_schedstats,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SCHEDSTATS */
#endif /* CONFIG_SMP */
@@ -418,7 +415,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "numa_balancing",
@@ -426,8 +423,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_numa_balancing,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
@@ -475,8 +472,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -486,7 +483,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
},
#endif
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
@@ -496,8 +493,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_energy_aware_handler,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_PROVE_LOCKING
@@ -562,7 +559,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &neg_one,
- .extra2 = &one,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_LATENCYTOP
@@ -696,8 +693,8 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
/* only handle a transition from default "0" to "1" */
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- .extra2 = &one,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_MODULES
@@ -715,8 +712,8 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
/* only handle a transition from default "0" to "1" */
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- .extra2 = &one,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_UEVENT_HELPER
@@ -875,7 +872,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &ten_thousand,
},
{
@@ -891,8 +888,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax_sysadmin,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "kptr_restrict",
@@ -900,7 +897,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax_sysadmin,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
#endif
@@ -925,8 +922,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_watchdog,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "watchdog_thresh",
@@ -934,7 +931,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_watchdog_thresh,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &sixty,
},
{
@@ -943,8 +940,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = NMI_WATCHDOG_SYSCTL_PERM,
.proc_handler = proc_nmi_watchdog,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "watchdog_cpumask",
@@ -960,8 +957,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_soft_watchdog,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "softlockup_panic",
@@ -969,8 +966,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#ifdef CONFIG_SMP
{
@@ -979,8 +976,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SMP */
#endif
@@ -991,8 +988,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#ifdef CONFIG_SMP
{
@@ -1001,8 +998,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SMP */
#endif
@@ -1115,8 +1112,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "hung_task_check_count",
@@ -1124,7 +1121,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "hung_task_timeout_secs",
@@ -1201,7 +1198,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_perf_event_sample_rate),
.mode = 0644,
.proc_handler = perf_proc_update_handler,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "perf_cpu_time_max_percent",
@@ -1209,7 +1206,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
.mode = 0644,
.proc_handler = perf_cpu_time_max_percent_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
{
@@ -1218,7 +1215,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_perf_event_max_stack),
.mode = 0644,
.proc_handler = perf_event_max_stack_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &six_hundred_forty_kb,
},
{
@@ -1227,7 +1224,7 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack),
.mode = 0644,
.proc_handler = perf_event_max_stack_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_thousand,
},
#endif
@@ -1237,8 +1234,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
{
@@ -1247,8 +1244,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = timer_migration_handler,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_BPF_SYSCALL
@@ -1259,8 +1256,8 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
/* only handle a transition from default "0" to "1" */
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- .extra2 = &one,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "bpf_stats_enabled",
@@ -1277,8 +1274,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(sysctl_panic_on_rcu_stall),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
@@ -1288,8 +1285,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = stack_erasing_sysctl,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{ }
@@ -1302,7 +1299,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_overcommit_memory),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
{
@@ -1311,7 +1308,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_panic_on_oom),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
{
@@ -1348,7 +1345,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "dirty_background_ratio",
@@ -1356,7 +1353,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(dirty_background_ratio),
.mode = 0644,
.proc_handler = dirty_background_ratio_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
{
@@ -1373,7 +1370,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(vm_dirty_ratio),
.mode = 0644,
.proc_handler = dirty_ratio_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
{
@@ -1397,7 +1394,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(dirty_expire_interval),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "dirtytime_expire_seconds",
@@ -1405,7 +1402,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(dirtytime_expire_interval),
.mode = 0644,
.proc_handler = dirtytime_interval_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "swappiness",
@@ -1413,7 +1410,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(vm_swappiness),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
#ifdef CONFIG_HUGETLB_PAGE
@@ -1438,8 +1435,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sysctl_vm_numa_stat_handler,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{
@@ -1470,7 +1467,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = drop_caches_sysctl_handler,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
.extra2 = &four,
},
#ifdef CONFIG_COMPACTION
@@ -1496,8 +1493,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_COMPACTION */
@@ -1507,7 +1504,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(min_free_kbytes),
.mode = 0644,
.proc_handler = min_free_kbytes_sysctl_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "watermark_boost_factor",
@@ -1515,7 +1512,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(watermark_boost_factor),
.mode = 0644,
.proc_handler = watermark_boost_factor_sysctl_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "watermark_scale_factor",
@@ -1523,7 +1520,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(watermark_scale_factor),
.mode = 0644,
.proc_handler = watermark_scale_factor_sysctl_handler,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
.extra2 = &one_thousand,
},
{
@@ -1532,7 +1529,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644,
.proc_handler = percpu_pagelist_fraction_sysctl_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#ifdef CONFIG_MMU
{
@@ -1541,7 +1538,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_max_map_count),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#else
{
@@ -1550,7 +1547,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_nr_trim_pages),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#endif
{
@@ -1566,7 +1563,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(block_dump),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "vfs_cache_pressure",
@@ -1574,7 +1571,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_vfs_cache_pressure),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
{
@@ -1583,7 +1580,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_legacy_va_layout),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#endif
#ifdef CONFIG_NUMA
@@ -1593,7 +1590,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(node_reclaim_mode),
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "min_unmapped_ratio",
@@ -1601,7 +1598,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_min_unmapped_ratio),
.mode = 0644,
.proc_handler = sysctl_min_unmapped_ratio_sysctl_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
{
@@ -1610,7 +1607,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_min_slab_ratio),
.mode = 0644,
.proc_handler = sysctl_min_slab_ratio_sysctl_handler,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
#endif
@@ -1661,7 +1658,7 @@ static struct ctl_table vm_table[] = {
#endif
.mode = 0644,
.proc_handler = proc_dointvec,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#endif
#ifdef CONFIG_HIGHMEM
@@ -1671,8 +1668,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(vm_highmem_is_dirtyable),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
#ifdef CONFIG_MEMORY_FAILURE
@@ -1682,8 +1679,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_memory_failure_early_kill),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "memory_failure_recovery",
@@ -1691,8 +1688,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_memory_failure_recovery),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{
@@ -1738,8 +1735,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(sysctl_unprivileged_userfaultfd),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{ }
@@ -1875,8 +1872,8 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "protected_hardlinks",
@@ -1884,8 +1881,8 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "protected_fifos",
@@ -1893,7 +1890,7 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
{
@@ -1902,7 +1899,7 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
{
@@ -1911,7 +1908,7 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax_coredump,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
@@ -1948,7 +1945,7 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
},
{ }
};
@@ -1970,8 +1967,8 @@ static struct ctl_table debug_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_kprobes_optimization_handler,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{ }
@@ -3395,8 +3392,8 @@ int proc_do_static_key(struct ctl_table *table, int write,
.data = &val,
.maxlen = sizeof(val),
.mode = table->mode,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
};
if (write && !capable(CAP_SYS_ADMIN))
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 57518efc3810..451f9d05ccfe 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -97,7 +97,7 @@ static int alarmtimer_rtc_add_device(struct device *dev,
if (!device_may_wakeup(rtc->dev.parent))
return -1;
- __ws = wakeup_source_register("alarmtimer");
+ __ws = wakeup_source_register(dev, "alarmtimer");
spin_lock_irqsave(&rtcdev_lock, flags);
if (!rtcdev) {
@@ -432,7 +432,7 @@ int alarm_cancel(struct alarm *alarm)
int ret = alarm_try_to_cancel(alarm);
if (ret >= 0)
return ret;
- cpu_relax();
+ hrtimer_cancel_wait_running(&alarm->timer);
}
}
EXPORT_SYMBOL_GPL(alarm_cancel);
@@ -606,6 +606,19 @@ static int alarm_timer_try_to_cancel(struct k_itimer *timr)
}
/**
+ * alarm_timer_wait_running - Posix timer callback to wait for a timer
+ * @timr: Pointer to the posixtimer data struct
+ *
+ * Called from the core code when timer cancel detected that the callback
+ * is running. @timr is unlocked and rcu read lock is held to prevent it
+ * from being freed.
+ */
+static void alarm_timer_wait_running(struct k_itimer *timr)
+{
+ hrtimer_cancel_wait_running(&timr->it.alarm.alarmtimer.timer);
+}
+
+/**
* alarm_timer_arm - Posix timer callback to arm a timer
* @timr: Pointer to the posixtimer data struct
* @expires: The new expiry time
@@ -672,7 +685,7 @@ static int alarm_timer_create(struct k_itimer *new_timer)
enum alarmtimer_type type;
if (!alarmtimer_get_rtcdev())
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
@@ -790,7 +803,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
int ret = 0;
if (!alarmtimer_get_rtcdev())
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (flags & ~TIMER_ABSTIME)
return -EINVAL;
@@ -834,6 +847,7 @@ const struct k_clock alarm_clock = {
.timer_forward = alarm_timer_forward,
.timer_remaining = alarm_timer_remaining,
.timer_try_to_cancel = alarm_timer_try_to_cancel,
+ .timer_wait_running = alarm_timer_wait_running,
.nsleep = alarm_timer_nsleep,
};
#endif /* CONFIG_POSIX_TIMERS */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5ee77f1a8a92..0d4dc241c0fb 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -140,6 +140,11 @@ static struct hrtimer_cpu_base migration_cpu_base = {
#define migration_base migration_cpu_base.clock_base[0]
+static inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+ return base == &migration_base;
+}
+
/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
@@ -264,6 +269,11 @@ again:
#else /* CONFIG_SMP */
+static inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+ return false;
+}
+
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
{
@@ -427,6 +437,17 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
}
EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
+static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
+ clockid_t clock_id, enum hrtimer_mode mode);
+
+void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl,
+ clockid_t clock_id, enum hrtimer_mode mode)
+{
+ debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr);
+ __hrtimer_init_sleeper(sl, clock_id, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack);
+
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
debug_object_free(timer, &hrtimer_debug_descr);
@@ -1096,9 +1117,13 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/*
* Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
- * match.
+ * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
+ * expiry mode because unmarked timers are moved to softirq expiry.
*/
- WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);
+ else
+ WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
base = lock_hrtimer_base(timer, &flags);
@@ -1147,6 +1172,93 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
}
EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
+#ifdef CONFIG_PREEMPT_RT
+static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
+{
+ spin_lock_init(&base->softirq_expiry_lock);
+}
+
+static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
+{
+ spin_lock(&base->softirq_expiry_lock);
+}
+
+static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
+{
+ spin_unlock(&base->softirq_expiry_lock);
+}
+
+/*
+ * The counterpart to hrtimer_cancel_wait_running().
+ *
+ * If there is a waiter for cpu_base->expiry_lock, then it was waiting for
+ * the timer callback to finish. Drop expiry_lock and reaquire it. That
+ * allows the waiter to acquire the lock and make progress.
+ */
+static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
+ unsigned long flags)
+{
+ if (atomic_read(&cpu_base->timer_waiters)) {
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ spin_unlock(&cpu_base->softirq_expiry_lock);
+ spin_lock(&cpu_base->softirq_expiry_lock);
+ raw_spin_lock_irq(&cpu_base->lock);
+ }
+}
+
+/*
+ * This function is called on PREEMPT_RT kernels when the fast path
+ * deletion of a timer failed because the timer callback function was
+ * running.
+ *
+ * This prevents priority inversion: if the soft irq thread is preempted
+ * in the middle of a timer callback, then calling del_timer_sync() can
+ * lead to two issues:
+ *
+ * - If the caller is on a remote CPU then it has to spin wait for the timer
+ * handler to complete. This can result in unbound priority inversion.
+ *
+ * - If the caller originates from the task which preempted the timer
+ * handler on the same CPU, then spin waiting for the timer handler to
+ * complete is never going to end.
+ */
+void hrtimer_cancel_wait_running(const struct hrtimer *timer)
+{
+ /* Lockless read. Prevent the compiler from reloading it below */
+ struct hrtimer_clock_base *base = READ_ONCE(timer->base);
+
+ /*
+ * Just relax if the timer expires in hard interrupt context or if
+ * it is currently on the migration base.
+ */
+ if (!timer->is_soft || is_migration_base(base)) {
+ cpu_relax();
+ return;
+ }
+
+ /*
+ * Mark the base as contended and grab the expiry lock, which is
+ * held by the softirq across the timer callback. Drop the lock
+ * immediately so the softirq can expire the next timer. In theory
+ * the timer could already be running again, but that's more than
+ * unlikely and just causes another wait loop.
+ */
+ atomic_inc(&base->cpu_base->timer_waiters);
+ spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
+ atomic_dec(&base->cpu_base->timer_waiters);
+ spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
+}
+#else
+static inline void
+hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
+static inline void
+hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
+static inline void
+hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
+ unsigned long flags) { }
+#endif
+
/**
* hrtimer_cancel - cancel a timer and wait for the handler to finish.
* @timer: the timer to be cancelled
@@ -1157,13 +1269,15 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
*/
int hrtimer_cancel(struct hrtimer *timer)
{
- for (;;) {
- int ret = hrtimer_try_to_cancel(timer);
+ int ret;
- if (ret >= 0)
- return ret;
- cpu_relax();
- }
+ do {
+ ret = hrtimer_try_to_cancel(timer);
+
+ if (ret < 0)
+ hrtimer_cancel_wait_running(timer);
+ } while (ret < 0);
+ return ret;
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);
@@ -1260,8 +1374,17 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
- int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
struct hrtimer_cpu_base *cpu_base;
+ int base;
+
+ /*
+ * On PREEMPT_RT enabled kernels hrtimers which are not explicitely
+ * marked for hard interrupt expiry mode are moved into soft
+ * interrupt context for latency reasons and because the callbacks
+ * can invoke functions which might sleep on RT, e.g. spin_lock().
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD))
+ softtimer = true;
memset(timer, 0, sizeof(struct hrtimer));
@@ -1275,8 +1398,10 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL)
clock_id = CLOCK_MONOTONIC;
+ base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0;
base += hrtimer_clockid_to_base(clock_id);
timer->is_soft = softtimer;
+ timer->is_hard = !softtimer;
timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
}
@@ -1449,6 +1574,8 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
break;
__run_hrtimer(cpu_base, base, timer, &basenow, flags);
+ if (active_mask == HRTIMER_ACTIVE_SOFT)
+ hrtimer_sync_wait_running(cpu_base, flags);
}
}
}
@@ -1459,6 +1586,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
unsigned long flags;
ktime_t now;
+ hrtimer_cpu_base_lock_expiry(cpu_base);
raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
@@ -1468,6 +1596,7 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h)
hrtimer_update_softirq_timer(cpu_base, true);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ hrtimer_cpu_base_unlock_expiry(cpu_base);
}
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1639,10 +1768,75 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
+/**
+ * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
+ * @sl: sleeper to be started
+ * @mode: timer mode abs/rel
+ *
+ * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
+ * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
+ */
+void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
+ enum hrtimer_mode mode)
+{
+ /*
+ * Make the enqueue delivery mode check work on RT. If the sleeper
+ * was initialized for hard interrupt delivery, force the mode bit.
+ * This is a special case for hrtimer_sleepers because
+ * hrtimer_init_sleeper() determines the delivery mode on RT so the
+ * fiddling with this decision is avoided at the call sites.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
+ mode |= HRTIMER_MODE_HARD;
+
+ hrtimer_start_expires(&sl->timer, mode);
+}
+EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
+
+static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
+ clockid_t clock_id, enum hrtimer_mode mode)
{
+ /*
+ * On PREEMPT_RT enabled kernels hrtimers which are not explicitely
+ * marked for hard interrupt expiry mode are moved into soft
+ * interrupt context either for latency reasons or because the
+ * hrtimer callback takes regular spinlocks or invokes other
+ * functions which are not suitable for hard interrupt context on
+ * PREEMPT_RT.
+ *
+ * The hrtimer_sleeper callback is RT compatible in hard interrupt
+ * context, but there is a latency concern: Untrusted userspace can
+ * spawn many threads which arm timers for the same expiry time on
+ * the same CPU. That causes a latency spike due to the wakeup of
+ * a gazillion threads.
+ *
+ * OTOH, priviledged real-time user space applications rely on the
+ * low latency of hard interrupt wakeups. If the current task is in
+ * a real-time scheduling class, mark the mode for hard interrupt
+ * expiry.
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT))
+ mode |= HRTIMER_MODE_HARD;
+ }
+
+ __hrtimer_init(&sl->timer, clock_id, mode);
sl->timer.function = hrtimer_wakeup;
- sl->task = task;
+ sl->task = current;
+}
+
+/**
+ * hrtimer_init_sleeper - initialize sleeper to the given clock
+ * @sl: sleeper to be initialized
+ * @clock_id: the clock to be used
+ * @mode: timer mode abs/rel
+ */
+void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
+ enum hrtimer_mode mode)
+{
+ debug_init(&sl->timer, clock_id, mode);
+ __hrtimer_init_sleeper(sl, clock_id, mode);
+
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
@@ -1669,11 +1863,9 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
{
struct restart_block *restart;
- hrtimer_init_sleeper(t, current);
-
do {
set_current_state(TASK_INTERRUPTIBLE);
- hrtimer_start_expires(&t->timer, mode);
+ hrtimer_sleeper_start_expires(t, mode);
if (likely(t->task))
freezable_schedule();
@@ -1707,10 +1899,9 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
struct hrtimer_sleeper t;
int ret;
- hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
- HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid,
+ HRTIMER_MODE_ABS);
hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
-
ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
destroy_hrtimer_on_stack(&t.timer);
return ret;
@@ -1728,7 +1919,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp,
if (dl_task(current) || rt_task(current))
slack = 0;
- hrtimer_init_on_stack(&t.timer, clockid, mode);
+ hrtimer_init_sleeper_on_stack(&t, clockid, mode);
hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
ret = do_nanosleep(&t, mode);
if (ret != -ERESTART_RESTARTBLOCK)
@@ -1809,6 +2000,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
+ hrtimer_cpu_base_init_expiry_lock(cpu_base);
return 0;
}
@@ -1927,12 +2119,9 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return -EINTR;
}
- hrtimer_init_on_stack(&t.timer, clock_id, mode);
+ hrtimer_init_sleeper_on_stack(&t, clock_id, mode);
hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
-
- hrtimer_init_sleeper(&t, current);
-
- hrtimer_start_expires(&t.timer, mode);
+ hrtimer_sleeper_start_expires(&t, mode);
if (likely(t.task))
schedule();
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 02068b2d5862..77f1e5635cc1 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -55,15 +55,10 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
val = it->expires;
interval = it->incr;
if (val) {
- struct task_cputime cputime;
- u64 t;
+ u64 t, samples[CPUCLOCK_MAX];
- thread_group_cputimer(tsk, &cputime);
- if (clock_id == CPUCLOCK_PROF)
- t = cputime.utime + cputime.stime;
- else
- /* CPUCLOCK_VIRT */
- t = cputime.utime;
+ thread_group_sample_cputime(tsk, samples);
+ t = samples[clock_id];
if (val < t)
/* about to fire */
@@ -213,6 +208,7 @@ again:
/* We are sharing ->siglock with it_real_fn() */
if (hrtimer_try_to_cancel(timer) < 0) {
spin_unlock_irq(&tsk->sighand->siglock);
+ hrtimer_cancel_wait_running(timer);
goto again;
}
expires = timeval_to_ktime(value->it_value);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0a426f4e3125..92a431981b1c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -20,11 +20,20 @@
static void posix_cpu_timer_rearm(struct k_itimer *timer);
+void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
+{
+ posix_cputimers_init(pct);
+ if (cpu_limit != RLIM_INFINITY) {
+ pct->bases[CPUCLOCK_PROF].nextevt = cpu_limit * NSEC_PER_SEC;
+ pct->timers_active = true;
+ }
+}
+
/*
* Called after updating RLIMIT_CPU to run cpu timer and update
- * tsk->signal->cputime_expires expiration cache if necessary. Needs
- * siglock protection since other code may update expiration cache as
- * well.
+ * tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
+ * necessary. Needs siglock protection since other code may update the
+ * expiration cache as well.
*/
void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
@@ -35,46 +44,97 @@ void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
spin_unlock_irq(&task->sighand->siglock);
}
-static int check_clock(const clockid_t which_clock)
+/*
+ * Functions for validating access to tasks.
+ */
+static struct task_struct *lookup_task(const pid_t pid, bool thread,
+ bool gettime)
{
- int error = 0;
struct task_struct *p;
- const pid_t pid = CPUCLOCK_PID(which_clock);
-
- if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX)
- return -EINVAL;
- if (pid == 0)
- return 0;
+ /*
+ * If the encoded PID is 0, then the timer is targeted at current
+ * or the process to which current belongs.
+ */
+ if (!pid)
+ return thread ? current : current->group_leader;
- rcu_read_lock();
p = find_task_by_vpid(pid);
- if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
- same_thread_group(p, current) : has_group_leader_pid(p))) {
- error = -EINVAL;
+ if (!p)
+ return p;
+
+ if (thread)
+ return same_thread_group(p, current) ? p : NULL;
+
+ if (gettime) {
+ /*
+ * For clock_gettime(PROCESS) the task does not need to be
+ * the actual group leader. tsk->sighand gives
+ * access to the group's clock.
+ *
+ * Timers need the group leader because they take a
+ * reference on it and store the task pointer until the
+ * timer is destroyed.
+ */
+ return (p == current || thread_group_leader(p)) ? p : NULL;
}
+
+ /*
+ * For processes require that p is group leader.
+ */
+ return has_group_leader_pid(p) ? p : NULL;
+}
+
+static struct task_struct *__get_task_for_clock(const clockid_t clock,
+ bool getref, bool gettime)
+{
+ const bool thread = !!CPUCLOCK_PERTHREAD(clock);
+ const pid_t pid = CPUCLOCK_PID(clock);
+ struct task_struct *p;
+
+ if (CPUCLOCK_WHICH(clock) >= CPUCLOCK_MAX)
+ return NULL;
+
+ rcu_read_lock();
+ p = lookup_task(pid, thread, gettime);
+ if (p && getref)
+ get_task_struct(p);
rcu_read_unlock();
+ return p;
+}
- return error;
+static inline struct task_struct *get_task_for_clock(const clockid_t clock)
+{
+ return __get_task_for_clock(clock, true, false);
+}
+
+static inline struct task_struct *get_task_for_clock_get(const clockid_t clock)
+{
+ return __get_task_for_clock(clock, true, true);
+}
+
+static inline int validate_clock_permissions(const clockid_t clock)
+{
+ return __get_task_for_clock(clock, false, false) ? 0 : -EINVAL;
}
/*
* Update expiry time from increment, and increase overrun count,
* given the current clock sample.
*/
-static void bump_cpu_timer(struct k_itimer *timer, u64 now)
+static u64 bump_cpu_timer(struct k_itimer *timer, u64 now)
{
+ u64 delta, incr, expires = timer->it.cpu.node.expires;
int i;
- u64 delta, incr;
if (!timer->it_interval)
- return;
+ return expires;
- if (now < timer->it.cpu.expires)
- return;
+ if (now < expires)
+ return expires;
incr = timer->it_interval;
- delta = now + incr - timer->it.cpu.expires;
+ delta = now + incr - expires;
/* Don't use (incr*2 < delta), incr*2 might overflow. */
for (i = 0; incr < delta - incr; i++)
@@ -84,48 +144,26 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now)
if (delta < incr)
continue;
- timer->it.cpu.expires += incr;
+ timer->it.cpu.node.expires += incr;
timer->it_overrun += 1LL << i;
delta -= incr;
}
+ return timer->it.cpu.node.expires;
}
-/**
- * task_cputime_zero - Check a task_cputime struct for all zero fields.
- *
- * @cputime: The struct to compare.
- *
- * Checks @cputime to see if all fields are zero. Returns true if all fields
- * are zero, false if any field is nonzero.
- */
-static inline int task_cputime_zero(const struct task_cputime *cputime)
+/* Check whether all cache entries contain U64_MAX, i.e. eternal expiry time */
+static inline bool expiry_cache_is_inactive(const struct posix_cputimers *pct)
{
- if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
- return 1;
- return 0;
-}
-
-static inline u64 prof_ticks(struct task_struct *p)
-{
- u64 utime, stime;
-
- task_cputime(p, &utime, &stime);
-
- return utime + stime;
-}
-static inline u64 virt_ticks(struct task_struct *p)
-{
- u64 utime, stime;
-
- task_cputime(p, &utime, &stime);
-
- return utime;
+ return !(~pct->bases[CPUCLOCK_PROF].nextevt |
+ ~pct->bases[CPUCLOCK_VIRT].nextevt |
+ ~pct->bases[CPUCLOCK_SCHED].nextevt);
}
static int
posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
{
- int error = check_clock(which_clock);
+ int error = validate_clock_permissions(which_clock);
+
if (!error) {
tp->tv_sec = 0;
tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ);
@@ -142,42 +180,66 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
}
static int
-posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp)
+posix_cpu_clock_set(const clockid_t clock, const struct timespec64 *tp)
{
+ int error = validate_clock_permissions(clock);
+
/*
* You can never reset a CPU clock, but we check for other errors
* in the call before failing with EPERM.
*/
- int error = check_clock(which_clock);
- if (error == 0) {
- error = -EPERM;
- }
- return error;
+ return error ? : -EPERM;
}
-
/*
- * Sample a per-thread clock for the given task.
+ * Sample a per-thread clock for the given task. clkid is validated.
*/
-static int cpu_clock_sample(const clockid_t which_clock,
- struct task_struct *p, u64 *sample)
+static u64 cpu_clock_sample(const clockid_t clkid, struct task_struct *p)
{
- switch (CPUCLOCK_WHICH(which_clock)) {
- default:
- return -EINVAL;
+ u64 utime, stime;
+
+ if (clkid == CPUCLOCK_SCHED)
+ return task_sched_runtime(p);
+
+ task_cputime(p, &utime, &stime);
+
+ switch (clkid) {
case CPUCLOCK_PROF:
- *sample = prof_ticks(p);
- break;
+ return utime + stime;
case CPUCLOCK_VIRT:
- *sample = virt_ticks(p);
- break;
- case CPUCLOCK_SCHED:
- *sample = task_sched_runtime(p);
- break;
+ return utime;
+ default:
+ WARN_ON_ONCE(1);
}
return 0;
}
+static inline void store_samples(u64 *samples, u64 stime, u64 utime, u64 rtime)
+{
+ samples[CPUCLOCK_PROF] = stime + utime;
+ samples[CPUCLOCK_VIRT] = utime;
+ samples[CPUCLOCK_SCHED] = rtime;
+}
+
+static void task_sample_cputime(struct task_struct *p, u64 *samples)
+{
+ u64 stime, utime;
+
+ task_cputime(p, &utime, &stime);
+ store_samples(samples, stime, utime, p->se.sum_exec_runtime);
+}
+
+static void proc_sample_cputime_atomic(struct task_cputime_atomic *at,
+ u64 *samples)
+{
+ u64 stime, utime, rtime;
+
+ utime = atomic64_read(&at->utime);
+ stime = atomic64_read(&at->stime);
+ rtime = atomic64_read(&at->sum_exec_runtime);
+ store_samples(samples, stime, utime, rtime);
+}
+
/*
* Set cputime to sum_cputime if sum_cputime > cputime. Use cmpxchg
* to avoid race conditions with concurrent updates to cputime.
@@ -193,29 +255,56 @@ retry:
}
}
-static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic, struct task_cputime *sum)
+static void update_gt_cputime(struct task_cputime_atomic *cputime_atomic,
+ struct task_cputime *sum)
{
__update_gt_cputime(&cputime_atomic->utime, sum->utime);
__update_gt_cputime(&cputime_atomic->stime, sum->stime);
__update_gt_cputime(&cputime_atomic->sum_exec_runtime, sum->sum_exec_runtime);
}
-/* Sample task_cputime_atomic values in "atomic_timers", store results in "times". */
-static inline void sample_cputime_atomic(struct task_cputime *times,
- struct task_cputime_atomic *atomic_times)
+/**
+ * thread_group_sample_cputime - Sample cputime for a given task
+ * @tsk: Task for which cputime needs to be started
+ * @iimes: Storage for time samples
+ *
+ * Called from sys_getitimer() to calculate the expiry time of an active
+ * timer. That means group cputime accounting is already active. Called
+ * with task sighand lock held.
+ *
+ * Updates @times with an uptodate sample of the thread group cputimes.
+ */
+void thread_group_sample_cputime(struct task_struct *tsk, u64 *samples)
{
- times->utime = atomic64_read(&atomic_times->utime);
- times->stime = atomic64_read(&atomic_times->stime);
- times->sum_exec_runtime = atomic64_read(&atomic_times->sum_exec_runtime);
+ struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+ struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
+
+ WARN_ON_ONCE(!pct->timers_active);
+
+ proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
}
-void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
+/**
+ * thread_group_start_cputime - Start cputime and return a sample
+ * @tsk: Task for which cputime needs to be started
+ * @samples: Storage for time samples
+ *
+ * The thread group cputime accouting is avoided when there are no posix
+ * CPU timers armed. Before starting a timer it's required to check whether
+ * the time accounting is active. If not, a full update of the atomic
+ * accounting store needs to be done and the accounting enabled.
+ *
+ * Updates @times with an uptodate sample of the thread group cputimes.
+ */
+static void thread_group_start_cputime(struct task_struct *tsk, u64 *samples)
{
struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
- struct task_cputime sum;
+ struct posix_cputimers *pct = &tsk->signal->posix_cputimers;
/* Check if cputimer isn't running. This is accessed without locking. */
- if (!READ_ONCE(cputimer->running)) {
+ if (!READ_ONCE(pct->timers_active)) {
+ struct task_cputime sum;
+
/*
* The POSIX timer interface allows for absolute time expiry
* values through the TIMER_ABSTIME flag, therefore we have
@@ -225,94 +314,69 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
update_gt_cputime(&cputimer->cputime_atomic, &sum);
/*
- * We're setting cputimer->running without a lock. Ensure
- * this only gets written to in one operation. We set
- * running after update_gt_cputime() as a small optimization,
- * but barriers are not required because update_gt_cputime()
+ * We're setting timers_active without a lock. Ensure this
+ * only gets written to in one operation. We set it after
+ * update_gt_cputime() as a small optimization, but
+ * barriers are not required because update_gt_cputime()
* can handle concurrent updates.
*/
- WRITE_ONCE(cputimer->running, true);
+ WRITE_ONCE(pct->timers_active, true);
}
- sample_cputime_atomic(times, &cputimer->cputime_atomic);
+ proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
}
-/*
- * Sample a process (thread group) clock for the given group_leader task.
- * Must be called with task sighand lock held for safe while_each_thread()
- * traversal.
- */
-static int cpu_clock_sample_group(const clockid_t which_clock,
- struct task_struct *p,
- u64 *sample)
+static void __thread_group_cputime(struct task_struct *tsk, u64 *samples)
{
- struct task_cputime cputime;
+ struct task_cputime ct;
- switch (CPUCLOCK_WHICH(which_clock)) {
- default:
- return -EINVAL;
- case CPUCLOCK_PROF:
- thread_group_cputime(p, &cputime);
- *sample = cputime.utime + cputime.stime;
- break;
- case CPUCLOCK_VIRT:
- thread_group_cputime(p, &cputime);
- *sample = cputime.utime;
- break;
- case CPUCLOCK_SCHED:
- thread_group_cputime(p, &cputime);
- *sample = cputime.sum_exec_runtime;
- break;
- }
- return 0;
+ thread_group_cputime(tsk, &ct);
+ store_samples(samples, ct.stime, ct.utime, ct.sum_exec_runtime);
}
-static int posix_cpu_clock_get_task(struct task_struct *tsk,
- const clockid_t which_clock,
- struct timespec64 *tp)
+/*
+ * Sample a process (thread group) clock for the given task clkid. If the
+ * group's cputime accounting is already enabled, read the atomic
+ * store. Otherwise a full update is required. Task's sighand lock must be
+ * held to protect the task traversal on a full update. clkid is already
+ * validated.
+ */
+static u64 cpu_clock_sample_group(const clockid_t clkid, struct task_struct *p,
+ bool start)
{
- int err = -EINVAL;
- u64 rtn;
+ struct thread_group_cputimer *cputimer = &p->signal->cputimer;
+ struct posix_cputimers *pct = &p->signal->posix_cputimers;
+ u64 samples[CPUCLOCK_MAX];
- if (CPUCLOCK_PERTHREAD(which_clock)) {
- if (same_thread_group(tsk, current))
- err = cpu_clock_sample(which_clock, tsk, &rtn);
+ if (!READ_ONCE(pct->timers_active)) {
+ if (start)
+ thread_group_start_cputime(p, samples);
+ else
+ __thread_group_cputime(p, samples);
} else {
- if (tsk == current || thread_group_leader(tsk))
- err = cpu_clock_sample_group(which_clock, tsk, &rtn);
+ proc_sample_cputime_atomic(&cputimer->cputime_atomic, samples);
}
- if (!err)
- *tp = ns_to_timespec64(rtn);
-
- return err;
+ return samples[clkid];
}
-
-static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp)
+static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
{
- const pid_t pid = CPUCLOCK_PID(which_clock);
- int err = -EINVAL;
+ const clockid_t clkid = CPUCLOCK_WHICH(clock);
+ struct task_struct *tsk;
+ u64 t;
- if (pid == 0) {
- /*
- * Special case constant value for our own clocks.
- * We don't have to do any lookup to find ourselves.
- */
- err = posix_cpu_clock_get_task(current, which_clock, tp);
- } else {
- /*
- * Find the given PID, and validate that the caller
- * should be able to see it.
- */
- struct task_struct *p;
- rcu_read_lock();
- p = find_task_by_vpid(pid);
- if (p)
- err = posix_cpu_clock_get_task(p, which_clock, tp);
- rcu_read_unlock();
- }
+ tsk = get_task_for_clock_get(clock);
+ if (!tsk)
+ return -EINVAL;
- return err;
+ if (CPUCLOCK_PERTHREAD(clock))
+ t = cpu_clock_sample(clkid, tsk);
+ else
+ t = cpu_clock_sample_group(clkid, tsk, false);
+ put_task_struct(tsk);
+
+ *tp = ns_to_timespec64(t);
+ return 0;
}
/*
@@ -322,44 +386,15 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *t
*/
static int posix_cpu_timer_create(struct k_itimer *new_timer)
{
- int ret = 0;
- const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
- struct task_struct *p;
+ struct task_struct *p = get_task_for_clock(new_timer->it_clock);
- if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX)
+ if (!p)
return -EINVAL;
new_timer->kclock = &clock_posix_cpu;
-
- INIT_LIST_HEAD(&new_timer->it.cpu.entry);
-
- rcu_read_lock();
- if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
- if (pid == 0) {
- p = current;
- } else {
- p = find_task_by_vpid(pid);
- if (p && !same_thread_group(p, current))
- p = NULL;
- }
- } else {
- if (pid == 0) {
- p = current->group_leader;
- } else {
- p = find_task_by_vpid(pid);
- if (p && !has_group_leader_pid(p))
- p = NULL;
- }
- }
+ timerqueue_init(&new_timer->it.cpu.node);
new_timer->it.cpu.task = p;
- if (p) {
- get_task_struct(p);
- } else {
- ret = -EINVAL;
- }
- rcu_read_unlock();
-
- return ret;
+ return 0;
}
/*
@@ -370,12 +405,14 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
*/
static int posix_cpu_timer_del(struct k_itimer *timer)
{
- int ret = 0;
- unsigned long flags;
+ struct cpu_timer *ctmr = &timer->it.cpu;
+ struct task_struct *p = ctmr->task;
struct sighand_struct *sighand;
- struct task_struct *p = timer->it.cpu.task;
+ unsigned long flags;
+ int ret = 0;
- WARN_ON_ONCE(p == NULL);
+ if (WARN_ON_ONCE(!p))
+ return -EINVAL;
/*
* Protect against sighand release/switch in exit/exec and process/
@@ -384,15 +421,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
sighand = lock_task_sighand(p, &flags);
if (unlikely(sighand == NULL)) {
/*
- * We raced with the reaping of the task.
- * The deletion should have cleared us off the list.
+ * This raced with the reaping of the task. The exit cleanup
+ * should have removed this timer from the timer queue.
*/
- WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry));
+ WARN_ON_ONCE(ctmr->head || timerqueue_node_queued(&ctmr->node));
} else {
if (timer->it.cpu.firing)
ret = TIMER_RETRY;
else
- list_del(&timer->it.cpu.entry);
+ cpu_timer_dequeue(ctmr);
unlock_task_sighand(p, &flags);
}
@@ -403,25 +440,30 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
return ret;
}
-static void cleanup_timers_list(struct list_head *head)
+static void cleanup_timerqueue(struct timerqueue_head *head)
{
- struct cpu_timer_list *timer, *next;
+ struct timerqueue_node *node;
+ struct cpu_timer *ctmr;
- list_for_each_entry_safe(timer, next, head, entry)
- list_del_init(&timer->entry);
+ while ((node = timerqueue_getnext(head))) {
+ timerqueue_del(head, node);
+ ctmr = container_of(node, struct cpu_timer, node);
+ ctmr->head = NULL;
+ }
}
/*
- * Clean out CPU timers still ticking when a thread exited. The task
- * pointer is cleared, and the expiry time is replaced with the residual
- * time for later timer_gettime calls to return.
+ * Clean out CPU timers which are still armed when a thread exits. The
+ * timers are only removed from the list. No other updates are done. The
+ * corresponding posix timers are still accessible, but cannot be rearmed.
+ *
* This must be called with the siglock held.
*/
-static void cleanup_timers(struct list_head *head)
+static void cleanup_timers(struct posix_cputimers *pct)
{
- cleanup_timers_list(head);
- cleanup_timers_list(++head);
- cleanup_timers_list(++head);
+ cleanup_timerqueue(&pct->bases[CPUCLOCK_PROF].tqhead);
+ cleanup_timerqueue(&pct->bases[CPUCLOCK_VIRT].tqhead);
+ cleanup_timerqueue(&pct->bases[CPUCLOCK_SCHED].tqhead);
}
/*
@@ -431,16 +473,11 @@ static void cleanup_timers(struct list_head *head)
*/
void posix_cpu_timers_exit(struct task_struct *tsk)
{
- cleanup_timers(tsk->cpu_timers);
+ cleanup_timers(&tsk->posix_cputimers);
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
{
- cleanup_timers(tsk->signal->cpu_timers);
-}
-
-static inline int expires_gt(u64 expires, u64 new_exp)
-{
- return expires == 0 || expires > new_exp;
+ cleanup_timers(&tsk->signal->posix_cputimers);
}
/*
@@ -449,58 +486,33 @@ static inline int expires_gt(u64 expires, u64 new_exp)
*/
static void arm_timer(struct k_itimer *timer)
{
- struct task_struct *p = timer->it.cpu.task;
- struct list_head *head, *listpos;
- struct task_cputime *cputime_expires;
- struct cpu_timer_list *const nt = &timer->it.cpu;
- struct cpu_timer_list *next;
-
- if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
- head = p->cpu_timers;
- cputime_expires = &p->cputime_expires;
- } else {
- head = p->signal->cpu_timers;
- cputime_expires = &p->signal->cputime_expires;
- }
- head += CPUCLOCK_WHICH(timer->it_clock);
-
- listpos = head;
- list_for_each_entry(next, head, entry) {
- if (nt->expires < next->expires)
- break;
- listpos = &next->entry;
- }
- list_add(&nt->entry, listpos);
-
- if (listpos == head) {
- u64 exp = nt->expires;
+ int clkidx = CPUCLOCK_WHICH(timer->it_clock);
+ struct cpu_timer *ctmr = &timer->it.cpu;
+ u64 newexp = cpu_timer_getexpires(ctmr);
+ struct task_struct *p = ctmr->task;
+ struct posix_cputimer_base *base;
+
+ if (CPUCLOCK_PERTHREAD(timer->it_clock))
+ base = p->posix_cputimers.bases + clkidx;
+ else
+ base = p->signal->posix_cputimers.bases + clkidx;
+