From b62b6cf170237677ae96001940c54d5c09d0015a Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Tue, 3 Dec 2019 17:52:03 +0100 Subject: s390/spinlock: remove confusing comment in arch_spin_lock_wait arch_spin_lock_wait does not take steal time into consideration. Signed-off-by: Vasily Gorbik --- arch/s390/lib/spinlock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index ce1e4bbe53aa..9b2dab5a69f9 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -242,7 +242,6 @@ static inline void arch_spin_lock_classic(arch_spinlock_t *lp) void arch_spin_lock_wait(arch_spinlock_t *lp) { - /* Use classic spinlocks + niai if the steal time is >= 10% */ if (test_cpu_flag(CIF_DEDICATED_CPU)) arch_spin_lock_queued(lp); else -- cgit v1.2.3 From 7e914fd17e9a750d8896e5645c84e75e52d77a4b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 2 Dec 2019 09:02:15 +0000 Subject: s390/test_unwind: fix spelling mistake "reqister" -> "register" There is a spelling mistake in a pr_info message. Fix it. Link: https://lkml.kernel.org/r/20191202090215.28766-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Vasily Gorbik --- arch/s390/lib/test_unwind.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c index bda7ac0ddd29..32b7a30b2485 100644 --- a/arch/s390/lib/test_unwind.c +++ b/arch/s390/lib/test_unwind.c @@ -238,7 +238,7 @@ static int test_unwind_irq(struct unwindme *u) { preempt_disable(); if (register_external_irq(EXT_IRQ_CLK_COMP, unwindme_irq_handler)) { - pr_info("Couldn't reqister external interrupt handler"); + pr_info("Couldn't register external interrupt handler"); return -1; } u->task = current; -- cgit v1.2.3 From 39d4a501a9ef55c57b51e3ef07fc2aeed7f30b3b Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Thu, 28 Nov 2019 10:26:41 +0100 Subject: s390/cpum_sf: Adjust sampling interval to avoid hitting sample limits Function perf_event_ever_overflow() and perf_event_account_interrupt() are called every time samples are processed by the interrupt handler. However function perf_event_account_interrupt() has checks to avoid being flooded with interrupts (more then 1000 samples are received per task_tick). Samples are then dropped and a PERF_RECORD_THROTTLED is added to the perf data. The perf subsystem limit calculation is: maximum sample frequency := 100000 --> 1 samples per 10 us task_tick = 10ms = 10000us --> 1000 samples per task_tick The work flow is measurement_alert() uses SDBT head and each SBDT points to 511 SDB pages, each with 126 sample entries. After processing 8 SBDs and for each valid sample calling: perf_event_overflow() perf_event_account_interrupts() there is a considerable amount of samples being dropped, especially when the sample frequency is very high and near the 100000 limit. To avoid the high amount of samples being dropped near the end of a task_tick time frame, increment the sampling interval in case of dropped events. The CPU Measurement sampling facility on the s390 supports only intervals, specifiing how many CPU cycles have to be executed before a sample is generated. Increase the interval when the samples being generated hit the task_tick limit. Signed-off-by: Thomas Richter Signed-off-by: Vasily Gorbik --- arch/s390/kernel/perf_cpum_sf.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index c07fdcd73726..3f6b02ee73ec 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1315,6 +1315,22 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) if (sampl_overflow) OVERFLOW_REG(hwc) = DIV_ROUND_UP(OVERFLOW_REG(hwc) + sampl_overflow, 1 + num_sdb); + + /* Perf_event_overflow() and perf_event_account_interrupt() limit + * the interrupt rate to an upper limit. Roughly 1000 samples per + * task tick. + * Hitting this limit results in a large number + * of throttled REF_REPORT_THROTTLE entries and the samples + * are dropped. + * Slightly increase the interval to avoid hitting this limit. + */ + if (event_overflow) { + SAMPL_RATE(hwc) += DIV_ROUND_UP(SAMPL_RATE(hwc), 10); + debug_sprintf_event(sfdbg, 1, "%s: rate adjustment %ld\n", + __func__, + DIV_ROUND_UP(SAMPL_RATE(hwc), 10)); + } + if (sampl_overflow || event_overflow) debug_sprintf_event(sfdbg, 4, "%s: " "overflows: sample %llu event %llu" -- cgit v1.2.3 From 0539ad0b22877225095d8adef0c376f52cc23834 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Fri, 29 Nov 2019 15:24:25 +0100 Subject: s390/cpum_sf: Avoid SBD overflow condition in irq handler The s390 CPU Measurement sampling facility has an overflow condition which fires when all entries in a SBD are used. The measurement alert interrupt is triggered and reads out all samples in this SDB. It then tests the successor SDB, if this SBD is not full, the interrupt handler does not read any samples at all from this SDB The design waits for the hardware to fill this SBD and then trigger another meassurement alert interrupt. This scheme works nicely until an perf_event_overflow() function call discards the sample due to a too high sampling rate. The interrupt handler has logic to read out a partially filled SDB when the perf event overflow condition in linux common code is met. This causes the CPUM sampling measurement hardware and the PMU device driver to operate on the same SBD's trailer entry. This should not happen. This can be seen here using this trace: cpumsf_pmu_add: tear:0xb5286000 hw_perf_event_update: sdbt 0xb5286000 full 1 over 0 flush_all:0 hw_perf_event_update: sdbt 0xb5286008 full 0 over 0 flush_all:0 above shows 1. interrupt hw_perf_event_update: sdbt 0xb5286008 full 1 over 0 flush_all:0 hw_perf_event_update: sdbt 0xb5286008 full 0 over 0 flush_all:0 above shows 2. interrupt ... this goes on fine until... hw_perf_event_update: sdbt 0xb5286068 full 1 over 0 flush_all:0 perf_push_sample1: overflow one or more samples read from the IRQ handler are rejected by perf_event_overflow() and the IRQ handler advances to the next SDB and modifies the trailer entry of a partially filled SDB. hw_perf_event_update: sdbt 0xb5286070 full 0 over 0 flush_all:1 timestamp: 14:32:52.519953 Next time the IRQ handler is called for this SDB the trailer entry shows an overflow count of 19 missed entries. hw_perf_event_update: sdbt 0xb5286070 full 1 over 19 flush_all:1 timestamp: 14:32:52.970058 Remove access to a follow on SDB when event overflow happened. Signed-off-by: Thomas Richter Signed-off-by: Vasily Gorbik --- arch/s390/kernel/perf_cpum_sf.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 3f6b02ee73ec..77d93c534284 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1303,12 +1303,6 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) */ if (flush_all && done) break; - - /* If an event overflow happened, discard samples by - * processing any remaining sample-data-blocks. - */ - if (event_overflow) - flush_all = 1; } /* Account sample overflows in the event hardware structure */ -- cgit v1.2.3 From 157309a97015d8b32d65847d8a6d493d1a958e93 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 6 Dec 2019 08:27:51 +0100 Subject: s390/uv: use EOPNOTSUPP instead of ENOTSUPP ENOTSUP is just an internal kernel error and should never reach userspace. The return value of the share function is not exported to userspace, but to avoid giving bad examples let us use EOPNOTSUPP: Suggested-by: Heiko Carstens Acked-by: Janosch Frank Signed-off-by: Christian Borntraeger Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/uv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index ef3c00b049ab..4093a2856929 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -86,7 +86,7 @@ static inline int share(unsigned long addr, u16 cmd) }; if (!is_prot_virt_guest()) - return -ENOTSUPP; + return -EOPNOTSUPP; /* * Sharing is page wise, if we encounter addresses that are * not page aligned, we assume something went wrong. If -- cgit v1.2.3 From 1b68ac8678a8e9993deebd55014cbe803e78ca02 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 29 Nov 2019 12:59:59 +0100 Subject: s390: remove last diag 0x44 caller diag 0x44 is a voluntary undirected yield of a virtual CPU. This has caused a lot of performance issues in the past. There is only one caller left, and that one is only executed if diag 0x9c (directed yield) is not present. Given that all hypervisors implement diag 0x9c anyway, remove the last diag 0x44 to avoid that more callers will be added. Worst case that could happen now, if diag 0x9c is not present, is that a virtual CPU would loop a bit instead of giving its time slice up. diag 0x44 statistics in debugfs are kept and will always be zero, so that user space can tell that there are no calls. Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/setup.h | 2 -- arch/s390/kernel/early.c | 16 ---------------- arch/s390/kernel/smp.c | 13 +++++-------- 3 files changed, 5 insertions(+), 26 deletions(-) diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 6dc6c4fbc8e2..69289e99cabd 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -27,7 +27,6 @@ #define MACHINE_FLAG_DIAG9C BIT(3) #define MACHINE_FLAG_ESOP BIT(4) #define MACHINE_FLAG_IDTE BIT(5) -#define MACHINE_FLAG_DIAG44 BIT(6) #define MACHINE_FLAG_EDAT1 BIT(7) #define MACHINE_FLAG_EDAT2 BIT(8) #define MACHINE_FLAG_TOPOLOGY BIT(10) @@ -94,7 +93,6 @@ extern unsigned long __swsusp_reset_dma; #define MACHINE_HAS_DIAG9C (S390_lowcore.machine_flags & MACHINE_FLAG_DIAG9C) #define MACHINE_HAS_ESOP (S390_lowcore.machine_flags & MACHINE_FLAG_ESOP) #define MACHINE_HAS_IDTE (S390_lowcore.machine_flags & MACHINE_FLAG_IDTE) -#define MACHINE_HAS_DIAG44 (S390_lowcore.machine_flags & MACHINE_FLAG_DIAG44) #define MACHINE_HAS_EDAT1 (S390_lowcore.machine_flags & MACHINE_FLAG_EDAT1) #define MACHINE_HAS_EDAT2 (S390_lowcore.machine_flags & MACHINE_FLAG_EDAT2) #define MACHINE_HAS_TOPOLOGY (S390_lowcore.machine_flags & MACHINE_FLAG_TOPOLOGY) diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index db32a55daaec..cd241ee66eff 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -204,21 +204,6 @@ static __init void detect_diag9c(void) S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG9C; } -static __init void detect_diag44(void) -{ - int rc; - - diag_stat_inc(DIAG_STAT_X044); - asm volatile( - " diag 0,0,0x44\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=d" (rc) : "0" (-EOPNOTSUPP) : "cc"); - if (!rc) - S390_lowcore.machine_flags |= MACHINE_FLAG_DIAG44; -} - static __init void detect_machine_facilities(void) { if (test_facility(8)) { @@ -331,7 +316,6 @@ void __init startup_init(void) setup_arch_string(); setup_boot_command_line(); detect_diag9c(); - detect_diag44(); detect_machine_facilities(); save_vector_registers(); setup_topology(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 2794cad9312e..a08bd2522dd9 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -413,14 +413,11 @@ EXPORT_SYMBOL(arch_vcpu_is_preempted); void smp_yield_cpu(int cpu) { - if (MACHINE_HAS_DIAG9C) { - diag_stat_inc_norecursion(DIAG_STAT_X09C); - asm volatile("diag %0,0,0x9c" - : : "d" (pcpu_devices[cpu].address)); - } else if (MACHINE_HAS_DIAG44 && !smp_cpu_mtid) { - diag_stat_inc_norecursion(DIAG_STAT_X044); - asm volatile("diag 0,0,0x44"); - } + if (!MACHINE_HAS_DIAG9C) + return; + diag_stat_inc_norecursion(DIAG_STAT_X09C); + asm volatile("diag %0,0,0x9c" + : : "d" (pcpu_devices[cpu].address)); } /* -- cgit v1.2.3 From 3e39ce266a273e5751e5792aee248cb8d9630c8b Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 2 Aug 2019 12:42:59 +0200 Subject: s390/kasan: add KASAN_VMALLOC support Add KASAN_VMALLOC support which now enables vmalloc memory area access checks as well as enables usage of VMAP_STACK under kasan. KASAN_VMALLOC changes the way vmalloc and modules areas shadow memory is handled. With this new approach only top level page tables are pre-populated and lower levels are filled dynamically upon memory allocation. Acked-by: Ilya Leoshkevich Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 1 + arch/s390/mm/kasan_init.c | 68 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d4051e88e625..bc88841d335d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -124,6 +124,7 @@ config S390 select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN + select HAVE_ARCH_KASAN_VMALLOC select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c index 460f25572940..06345616a646 100644 --- a/arch/s390/mm/kasan_init.c +++ b/arch/s390/mm/kasan_init.c @@ -82,7 +82,8 @@ static pte_t * __init kasan_early_pte_alloc(void) enum populate_mode { POPULATE_ONE2ONE, POPULATE_MAP, - POPULATE_ZERO_SHADOW + POPULATE_ZERO_SHADOW, + POPULATE_SHALLOW }; static void __init kasan_early_vmemmap_populate(unsigned long address, unsigned long end, @@ -116,6 +117,12 @@ static void __init kasan_early_vmemmap_populate(unsigned long address, pgd_populate(&init_mm, pg_dir, p4_dir); } + if (IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) && + mode == POPULATE_SHALLOW) { + address = (address + P4D_SIZE) & P4D_MASK; + continue; + } + p4_dir = p4d_offset(pg_dir, address); if (p4d_none(*p4_dir)) { if (mode == POPULATE_ZERO_SHADOW && @@ -130,6 +137,12 @@ static void __init kasan_early_vmemmap_populate(unsigned long address, p4d_populate(&init_mm, p4_dir, pu_dir); } + if (!IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) && + mode == POPULATE_SHALLOW) { + address = (address + PUD_SIZE) & PUD_MASK; + continue; + } + pu_dir = pud_offset(p4_dir, address); if (pud_none(*pu_dir)) { if (mode == POPULATE_ZERO_SHADOW && @@ -195,6 +208,9 @@ static void __init kasan_early_vmemmap_populate(unsigned long address, page = kasan_early_shadow_page; pte_val(*pt_dir) = __pa(page) | pgt_prot_zero; break; + case POPULATE_SHALLOW: + /* should never happen */ + break; } } address += PAGE_SIZE; @@ -313,22 +329,50 @@ void __init kasan_early_init(void) init_mm.pgd = early_pg_dir; /* * Current memory layout: - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * +- end of ram ----+ / +----------------+ - * | ... gap ... |/ | kasan | - * +- shadow start --+ | zero | - * | 1/8 addr space | | page | - * +- shadow end -+ | mapping | - * | ... gap ... |\ | (untracked) | - * +- modules vaddr -+ \ +----------------+ - * | 2Gb | \| unmapped | allocated per module - * +-----------------+ +- shadow end ---+ + * +- 0 -------------+ +- shadow start -+ + * | 1:1 ram mapping | /| 1/8 ram | + * | | / | | + * +- end of ram ----+ / +----------------+ + * | ... gap ... | / | | + * | |/ | kasan | + * +- shadow start --+ | zero | + * | 1/8 addr space | | page | + * +- shadow end -+ | mapping | + * | ... gap ... |\ | (untracked) | + * +- vmalloc area -+ \ | | + * | vmalloc_size | \ | | + * +- modules vaddr -+ \ +----------------+ + * | 2Gb | \| unmapped | allocated per module + * +-----------------+ +- shadow end ---+ + * + * Current memory layout (KASAN_VMALLOC): + * +- 0 -------------+ +- shadow start -+ + * | 1:1 ram mapping | /| 1/8 ram | + * | | / | | + * +- end of ram ----+ / +----------------+ + * | ... gap ... | / | kasan | + * | |/ | zero | + * +- shadow start --+ | page | + * | 1/8 addr space | | mapping | + * +- shadow end -+ | (untracked) | + * | ... gap ... |\ | | + * +- vmalloc area -+ \ +- vmalloc area -+ + * | vmalloc_size | \ |shallow populate| + * +- modules vaddr -+ \ +- modules area -+ + * | 2Gb | \|shallow populate| + * +-----------------+ +- shadow end ---+ */ /* populate kasan shadow (for identity mapping and zero page mapping) */ kasan_early_vmemmap_populate(__sha(0), __sha(memsize), POPULATE_MAP); if (IS_ENABLED(CONFIG_MODULES)) untracked_mem_end = vmax - MODULES_LEN; + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { + untracked_mem_end = vmax - vmalloc_size - MODULES_LEN; + /* shallowly populate kasan shadow for vmalloc and modules */ + kasan_early_vmemmap_populate(__sha(untracked_mem_end), + __sha(vmax), POPULATE_SHALLOW); + } + /* populate kasan shadow for untracked memory */ kasan_early_vmemmap_populate(__sha(max_physmem_end), __sha(untracked_mem_end), POPULATE_ZERO_SHADOW); -- cgit v1.2.3