summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig33
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/boot/compressed/eboot.c40
-rw-r--r--arch/x86/boot/compressed/head_32.S6
-rw-r--r--arch/x86/boot/compressed/head_64.S8
-rw-r--r--arch/x86/configs/kvm_guest.config31
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/entry/common.c24
-rw-r--r--arch/x86/entry/entry_32.S70
-rw-r--r--arch/x86/entry/entry_64.S159
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl3
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl3
-rw-r--r--arch/x86/entry/syscalls/syscalltbl.sh15
-rw-r--r--arch/x86/entry/thunk_32.S3
-rw-r--r--arch/x86/entry/thunk_64.S3
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c2
-rw-r--r--arch/x86/entry/vdso/vdso2c.h2
-rw-r--r--arch/x86/entry/vdso/vma.c175
-rw-r--r--arch/x86/events/core.c60
-rw-r--r--arch/x86/events/intel/core.c34
-rw-r--r--arch/x86/events/intel/ds.c108
-rw-r--r--arch/x86/events/intel/lbr.c74
-rw-r--r--arch/x86/events/intel/pt.c24
-rw-r--r--arch/x86/events/intel/pt.h5
-rw-r--r--arch/x86/events/intel/rapl.c5
-rw-r--r--arch/x86/events/intel/uncore.c12
-rw-r--r--arch/x86/events/intel/uncore.h7
-rw-r--r--arch/x86/events/intel/uncore_snb.c2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c626
-rw-r--r--arch/x86/events/perf_event.h13
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/include/asm/alternative.h8
-rw-r--r--arch/x86/include/asm/apic.h5
-rw-r--r--arch/x86/include/asm/cacheflush.h1
-rw-r--r--arch/x86/include/asm/cmpxchg.h44
-rw-r--r--arch/x86/include/asm/compat.h8
-rw-r--r--arch/x86/include/asm/cpufeatures.h3
-rw-r--r--arch/x86/include/asm/desc.h2
-rw-r--r--arch/x86/include/asm/e820.h6
-rw-r--r--arch/x86/include/asm/efi.h29
-rw-r--r--arch/x86/include/asm/export.h4
-rw-r--r--arch/x86/include/asm/extable.h35
-rw-r--r--arch/x86/include/asm/fpu/signal.h6
-rw-r--r--arch/x86/include/asm/fpu/xstate.h10
-rw-r--r--arch/x86/include/asm/ftrace.h3
-rw-r--r--arch/x86/include/asm/hypervisor.h4
-rw-r--r--arch/x86/include/asm/intel-family.h5
-rw-r--r--arch/x86/include/asm/intel-mid.h2
-rw-r--r--arch/x86/include/asm/intel_scu_ipc.h2
-rw-r--r--arch/x86/include/asm/irq.h5
-rw-r--r--arch/x86/include/asm/irqflags.h12
-rw-r--r--arch/x86/include/asm/kaslr.h1
-rw-r--r--arch/x86/include/asm/kdebug.h2
-rw-r--r--arch/x86/include/asm/kexec.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h78
-rw-r--r--arch/x86/include/asm/mce.h66
-rw-r--r--arch/x86/include/asm/mmu.h8
-rw-r--r--arch/x86/include/asm/mmu_context.h25
-rw-r--r--arch/x86/include/asm/mpspec.h3
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/paravirt.h22
-rw-r--r--arch/x86/include/asm/paravirt_types.h8
-rw-r--r--arch/x86/include/asm/pci.h14
-rw-r--r--arch/x86/include/asm/percpu.h5
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h4
-rw-r--r--arch/x86/include/asm/pgtable_types.h2
-rw-r--r--arch/x86/include/asm/pkeys.h73
-rw-r--r--arch/x86/include/asm/pmem.h5
-rw-r--r--arch/x86/include/asm/processor.h25
-rw-r--r--arch/x86/include/asm/pvclock.h5
-rw-r--r--arch/x86/include/asm/realmode.h2
-rw-r--r--arch/x86/include/asm/rwsem.h8
-rw-r--r--arch/x86/include/asm/sections.h2
-rw-r--r--arch/x86/include/asm/signal.h4
-rw-r--r--arch/x86/include/asm/smp.h4
-rw-r--r--arch/x86/include/asm/special_insns.h22
-rw-r--r--arch/x86/include/asm/spinlock.h174
-rw-r--r--arch/x86/include/asm/spinlock_types.h13
-rw-r--r--arch/x86/include/asm/stacktrace.h120
-rw-r--r--arch/x86/include/asm/string_64.h19
-rw-r--r--arch/x86/include/asm/switch_to.h164
-rw-r--r--arch/x86/include/asm/syscall.h20
-rw-r--r--arch/x86/include/asm/thread_info.h64
-rw-r--r--arch/x86/include/asm/traps.h6
-rw-r--r--arch/x86/include/asm/uaccess.h32
-rw-r--r--arch/x86/include/asm/unwind.h63
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h45
-rw-r--r--arch/x86/include/asm/vdso.h2
-rw-r--r--arch/x86/include/asm/xen/events.h11
-rw-r--r--arch/x86/include/uapi/asm/mce.h2
-rw-r--r--arch/x86/include/uapi/asm/prctl.h6
-rw-r--r--arch/x86/kernel/Makefile11
-rw-r--r--arch/x86/kernel/acpi/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c39
-rw-r--r--arch/x86/kernel/acpi/cppc_msr.c58
-rw-r--r--arch/x86/kernel/acpi/cstate.c2
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/apic/apic.c105
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c22
-rw-r--r--arch/x86/kernel/apic/apic_noop.c2
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c5
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c2
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c18
-rw-r--r--arch/x86/kernel/apic/io_apic.c4
-rw-r--r--arch/x86/kernel/apic/msi.c2
-rw-r--r--arch/x86/kernel/apic/probe_32.c6
-rw-r--r--arch/x86/kernel/apic/vector.c23
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c40
-rw-r--r--arch/x86/kernel/asm-offsets.c7
-rw-r--r--arch/x86/kernel/asm-offsets_32.c5
-rw-r--r--arch/x86/kernel/asm-offsets_64.c5
-rw-r--r--arch/x86/kernel/cpu/common.c41
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c44
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c204
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c52
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h2
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/vmware.c17
-rw-r--r--arch/x86/kernel/crash.c22
-rw-r--r--arch/x86/kernel/dumpstack.c258
-rw-r--r--arch/x86/kernel/dumpstack_32.c154
-rw-r--r--arch/x86/kernel/dumpstack_64.c318
-rw-r--r--arch/x86/kernel/e820.c153
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/fpu/core.c4
-rw-r--r--arch/x86/kernel/fpu/init.c1
-rw-r--r--arch/x86/kernel/fpu/xstate.c7
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/head_32.S10
-rw-r--r--arch/x86/kernel/head_64.S15
-rw-r--r--arch/x86/kernel/hpet.c94
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c47
-rw-r--r--arch/x86/kernel/irq_64.c3
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c4
-rw-r--r--arch/x86/kernel/kgdb.c8
-rw-r--r--arch/x86/kernel/kprobes/core.c13
-rw-r--r--arch/x86/kernel/kprobes/opt.c2
-rw-r--r--arch/x86/kernel/ksysfs.c2
-rw-r--r--arch/x86/kernel/kvm.c288
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/livepatch.c65
-rw-r--r--arch/x86/kernel/machine_kexec_64.c3
-rw-r--r--arch/x86/kernel/mcount_64.S2
-rw-r--r--arch/x86/kernel/mpparse.c3
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c7
-rw-r--r--arch/x86/kernel/paravirt.c3
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c4
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c4
-rw-r--r--arch/x86/kernel/process.c49
-rw-r--r--arch/x86/kernel/process_32.c33
-rw-r--r--arch/x86/kernel/process_64.c65
-rw-r--r--arch/x86/kernel/ptrace.c14
-rw-r--r--arch/x86/kernel/pvclock.c2
-rw-r--r--arch/x86/kernel/quirks.c31
-rw-r--r--arch/x86/kernel/reboot.c2
-rw-r--r--arch/x86/kernel/resource.c4
-rw-r--r--arch/x86/kernel/setup.c37
-rw-r--r--arch/x86/kernel/setup_percpu.c4
-rw-r--r--arch/x86/kernel/signal.c22
-rw-r--r--arch/x86/kernel/signal_compat.c31
-rw-r--r--arch/x86/kernel/smp.c7
-rw-r--r--arch/x86/kernel/smpboot.c93
-rw-r--r--arch/x86/kernel/stacktrace.c79
-rw-r--r--arch/x86/kernel/step.c3
-rw-r--r--arch/x86/kernel/sys_x86_64.c5
-rw-r--r--arch/x86/kernel/tboot.c8
-rw-r--r--arch/x86/kernel/traps.c61
-rw-r--r--arch/x86/kernel/tsc.c12
-rw-r--r--arch/x86/kernel/unwind_frame.c93
-rw-r--r--arch/x86/kernel/unwind_guess.c53
-rw-r--r--arch/x86/kernel/vmlinux.lds.S1
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c85
-rw-r--r--arch/x86/kernel/x86_init.c6
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/cpuid.c3
-rw-r--r--arch/x86/kvm/debugfs.c69
-rw-r--r--arch/x86/kvm/hyperv.c157
-rw-r--r--arch/x86/kvm/hyperv.h3
-rw-r--r--arch/x86/kvm/i8254.c14
-rw-r--r--arch/x86/kvm/ioapic.c2
-rw-r--r--arch/x86/kvm/lapic.c5
-rw-r--r--arch/x86/kvm/mmu.c12
-rw-r--r--arch/x86/kvm/svm.c419
-rw-r--r--arch/x86/kvm/vmx.c209
-rw-r--r--arch/x86/kvm/x86.c175
-rw-r--r--arch/x86/kvm/x86.h6
-rw-r--r--arch/x86/lib/checksum_32.S3
-rw-r--r--arch/x86/lib/clear_page_64.S2
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S2
-rw-r--r--arch/x86/lib/copy_page_64.S2
-rw-r--r--arch/x86/lib/copy_user_64.S8
-rw-r--r--arch/x86/lib/csum-partial_64.c1
-rw-r--r--arch/x86/lib/getuser.S5
-rw-r--r--arch/x86/lib/hweight.S3
-rw-r--r--arch/x86/lib/memcpy_64.S10
-rw-r--r--arch/x86/lib/memmove_64.S3
-rw-r--r--arch/x86/lib/memset_64.S3
-rw-r--r--arch/x86/lib/putuser.S5
-rw-r--r--arch/x86/lib/strstr_32.c3
-rw-r--r--arch/x86/mm/amdtopology.c22
-rw-r--r--arch/x86/mm/extable.c2
-rw-r--r--arch/x86/mm/fault.c43
-rw-r--r--arch/x86/mm/gup.c2
-rw-r--r--arch/x86/mm/init.c4
-rw-r--r--arch/x86/mm/kaslr.c26
-rw-r--r--arch/x86/mm/mpx.c5
-rw-r--r--arch/x86/mm/numa.c27
-rw-r--r--arch/x86/mm/pat_rbtree.c4
-rw-r--r--arch/x86/mm/pkeys.c142
-rw-r--r--arch/x86/mm/tlb.c15
-rw-r--r--arch/x86/oprofile/backtrace.c49
-rw-r--r--arch/x86/pci/Makefile2
-rw-r--r--arch/x86/pci/common.c7
-rw-r--r--arch/x86/pci/pcbios.c7
-rw-r--r--arch/x86/pci/vmd.c771
-rw-r--r--arch/x86/pci/xen.c2
-rw-r--r--arch/x86/platform/Makefile1
-rw-r--r--arch/x86/platform/atom/punit_atom_debug.c2
-rw-r--r--arch/x86/platform/efi/efi-bgrt.c13
-rw-r--r--arch/x86/platform/efi/efi.c200
-rw-r--r--arch/x86/platform/efi/efi_64.c23
-rw-r--r--arch/x86/platform/efi/quirks.c127
-rw-r--r--arch/x86/platform/intel-mid/device_libs/Makefile4
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c95
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c47
-rw-r--r--arch/x86/platform/intel-mid/intel-mid.c5
-rw-r--r--arch/x86/platform/intel-mid/pwr.c76
-rw-r--r--arch/x86/platform/mellanox/Makefile1
-rw-r--r--arch/x86/platform/mellanox/mlx-platform.c266
-rw-r--r--arch/x86/platform/uv/bios_uv.c17
-rw-r--r--arch/x86/platform/uv/tlb_uv.c184
-rw-r--r--arch/x86/power/cpu.c2
-rw-r--r--arch/x86/ras/mce_amd_inj.c54
-rw-r--r--arch/x86/um/Makefile2
-rw-r--r--arch/x86/um/checksum_32.S2
-rw-r--r--arch/x86/um/ksyms.c13
-rw-r--r--arch/x86/um/ptrace_32.c11
-rw-r--r--arch/x86/um/ptrace_64.c3
-rw-r--r--arch/x86/xen/enlighten.c135
-rw-r--r--arch/x86/xen/grant-table.c2
-rw-r--r--arch/x86/xen/platform-pci-unplug.c2
-rw-r--r--arch/x86/xen/pmu.c7
-rw-r--r--arch/x86/xen/setup.c2
-rw-r--r--arch/x86/xen/smp.c53
-rw-r--r--arch/x86/xen/smp.h13
-rw-r--r--arch/x86/xen/spinlock.c250
-rw-r--r--arch/x86/xen/time.c5
251 files changed, 5625 insertions, 4168 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2a1f0ce7c59a..bada636d1065 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,11 +23,11 @@ config X86
select ARCH_CLOCKSOURCE_DATA
select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
- select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_GIGANTIC_PAGE if X86_64
select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_MMIO_FLUSH
@@ -93,6 +93,7 @@ config X86
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_WITHIN_STACK_FRAMES
select HAVE_EBPF_JIT if X86_64
+ select HAVE_ARCH_VMAP_STACK if X86_64
select HAVE_CC_STACKPROTECTOR
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
@@ -109,7 +110,6 @@ config X86
select HAVE_EXIT_THREAD
select HAVE_FENTRY if X86_64
select HAVE_FTRACE_MCOUNT_RECORD
- select HAVE_FUNCTION_GRAPH_FP_TEST
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS
@@ -157,6 +157,7 @@ config X86
select SPARSE_IRQ
select SRCU
select SYSCTL_EXCEPTION_TRACE
+ select THREAD_INFO_IN_TASK
select USER_STACKTRACE_SUPPORT
select VIRT_TO_BUS
select X86_DEV_DMA_OPS if X86_64
@@ -549,6 +550,18 @@ config X86_INTEL_QUARK
Say Y here if you have a Quark based system such as the Arduino
compatible Intel Galileo.
+config MLX_PLATFORM
+ tristate "Mellanox Technologies platform support"
+ depends on X86_64
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
+ This option enables system support for the Mellanox Technologies
+ platform.
+
+ Say Y here if you are building a kernel for Mellanox system.
+
+ Otherwise, say N.
+
config X86_INTEL_LPSS
bool "Intel Low Power Subsystem Support"
depends on X86 && ACPI
@@ -705,7 +718,6 @@ config PARAVIRT_DEBUG
config PARAVIRT_SPINLOCKS
bool "Paravirtualization layer for spinlocks"
depends on PARAVIRT && SMP
- select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS
---help---
Paravirtualized spinlocks allow a pvops backend to replace the
spinlock implementation with something virtualization-friendly
@@ -718,7 +730,7 @@ config PARAVIRT_SPINLOCKS
config QUEUED_LOCK_STAT
bool "Paravirt queued spinlock statistics"
- depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS
+ depends on PARAVIRT_SPINLOCKS && DEBUG_FS
---help---
Enable the collection of statistical data on the slowpath
behavior of paravirtualized queued spinlocks and report
@@ -2745,19 +2757,6 @@ config PMC_ATOM
def_bool y
depends on PCI
-config VMD
- depends on PCI_MSI
- tristate "Volume Management Device Driver"
- default N
- ---help---
- Adds support for the Intel Volume Management Device (VMD). VMD is a
- secondary PCI host bridge that allows PCI Express root ports,
- and devices attached to them, to be removed from the default
- PCI domain and placed within the VMD domain. This provides
- more bus resources than are otherwise possible with a
- single domain. If you know your system provides one of these and
- has devices attached to it, say Y; if you are not sure, say N.
-
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 830ed391e7ef..2d449337a360 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -163,11 +163,12 @@ asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
+avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
LDFLAGS := -m elf_$(UTS_MACHINE)
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 94dd4a31f5b3..cc69e37548db 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -29,22 +29,11 @@ __pure const struct efi_config *__efi_early(void)
static void setup_boot_services##bits(struct efi_config *c) \
{ \
efi_system_table_##bits##_t *table; \
- efi_boot_services_##bits##_t *bt; \
\
table = (typeof(table))sys_table; \
\
+ c->boot_services = table->boottime; \
c->text_output = table->con_out; \
- \
- bt = (typeof(bt))(unsigned long)(table->boottime); \
- \
- c->allocate_pool = bt->allocate_pool; \
- c->allocate_pages = bt->allocate_pages; \
- c->get_memory_map = bt->get_memory_map; \
- c->free_pool = bt->free_pool; \
- c->free_pages = bt->free_pages; \
- c->locate_handle = bt->locate_handle; \
- c->handle_protocol = bt->handle_protocol; \
- c->exit_boot_services = bt->exit_boot_services; \
}
BOOT_SERVICES(32);
BOOT_SERVICES(64);
@@ -286,29 +275,6 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
}
}
-static void find_bits(unsigned long mask, u8 *pos, u8 *size)
-{
- u8 first, len;
-
- first = 0;
- len = 0;
-
- if (mask) {
- while (!(mask & 0x1)) {
- mask = mask >> 1;
- first++;
- }
-
- while (mask & 0x1) {
- mask = mask >> 1;
- len++;
- }
- }
-
- *pos = first;
- *size = len;
-}
-
static efi_status_t
__setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
{
@@ -578,7 +544,7 @@ setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height)
efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
unsigned long nr_ugas;
u32 *handles = (u32 *)uga_handle;;
- efi_status_t status;
+ efi_status_t status = EFI_INVALID_PARAMETER;
int i;
first_uga = NULL;
@@ -623,7 +589,7 @@ setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height)
efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
unsigned long nr_ugas;
u64 *handles = (u64 *)uga_handle;;
- efi_status_t status;
+ efi_status_t status = EFI_INVALID_PARAMETER;
int i;
first_uga = NULL;
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1038524270e7..fd0b6a272dd5 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -82,7 +82,7 @@ ENTRY(efi_pe_entry)
/* Relocate efi_config->call() */
leal efi32_config(%esi), %eax
- add %esi, 88(%eax)
+ add %esi, 32(%eax)
pushl %eax
call make_boot_params
@@ -108,7 +108,7 @@ ENTRY(efi32_stub_entry)
/* Relocate efi_config->call() */
leal efi32_config(%esi), %eax
- add %esi, 88(%eax)
+ add %esi, 32(%eax)
pushl %eax
2:
call efi_main
@@ -264,7 +264,7 @@ relocated:
#ifdef CONFIG_EFI_STUB
.data
efi32_config:
- .fill 11,8,0
+ .fill 4,8,0
.long efi_call_phys
.long 0
.byte 0
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 0d80a7ad65cd..efdfba21a5b2 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -265,7 +265,7 @@ ENTRY(efi_pe_entry)
/*
* Relocate efi_config->call().
*/
- addq %rbp, efi64_config+88(%rip)
+ addq %rbp, efi64_config+32(%rip)
movq %rax, %rdi
call make_boot_params
@@ -285,7 +285,7 @@ handover_entry:
* Relocate efi_config->call().
*/
movq efi_config(%rip), %rax
- addq %rbp, 88(%rax)
+ addq %rbp, 32(%rax)
2:
movq efi_config(%rip), %rdi
call efi_main
@@ -457,14 +457,14 @@ efi_config:
#ifdef CONFIG_EFI_MIXED
.global efi32_config
efi32_config:
- .fill 11,8,0
+ .fill 4,8,0
.quad efi64_thunk
.byte 0
#endif
.global efi64_config
efi64_config:
- .fill 11,8,0
+ .fill 4,8,0
.quad efi_call
.byte 1
#endif /* CONFIG_EFI_STUB */
diff --git a/arch/x86/configs/kvm_guest.config b/arch/x86/configs/kvm_guest.config
deleted file mode 100644
index 9906505c998a..000000000000
--- a/arch/x86/configs/kvm_guest.config
+++ /dev/null
@@ -1,31 +0,0 @@
-CONFIG_NET=y
-CONFIG_NET_CORE=y
-CONFIG_NETDEVICES=y
-CONFIG_BLOCK=y
-CONFIG_BLK_DEV=y
-CONFIG_NETWORK_FILESYSTEMS=y
-CONFIG_INET=y
-CONFIG_TTY=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_IP_PNP=y
-CONFIG_IP_PNP_DHCP=y
-CONFIG_BINFMT_ELF=y
-CONFIG_PCI=y
-CONFIG_PCI_MSI=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_VIRTUALIZATION=y
-CONFIG_HYPERVISOR_GUEST=y
-CONFIG_PARAVIRT=y
-CONFIG_KVM_GUEST=y
-CONFIG_VIRTIO=y
-CONFIG_VIRTIO_PCI=y
-CONFIG_VIRTIO_BLK=y
-CONFIG_VIRTIO_CONSOLE=y
-CONFIG_VIRTIO_NET=y
-CONFIG_9P_FS=y
-CONFIG_NET_9P=y
-CONFIG_NET_9P_VIRTIO=y
-CONFIG_SCSI_LOWLEVEL=y
-CONFIG_SCSI_VIRTIO=y
-CONFIG_VIRTIO_INPUT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index d28bdabcc87e..7ef4a099defc 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -255,7 +255,6 @@ CONFIG_RTC_CLASS=y
CONFIG_DMADEVICES=y
CONFIG_EEEPC_LAPTOP=y
CONFIG_AMD_IOMMU=y
-CONFIG_AMD_IOMMU_STATS=y
CONFIG_INTEL_IOMMU=y
# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set
CONFIG_EFI_VARS=y
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 1433f6b4607d..bdd9cc59d20f 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -31,13 +31,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
-static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
-{
- unsigned long top_of_stack =
- (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
- return (struct thread_info *)(top_of_stack - THREAD_SIZE);
-}
-
#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
__visible inline void enter_from_user_mode(void)
@@ -71,7 +64,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
{
u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
unsigned long ret = 0;
bool emulated = false;
u32 work;
@@ -173,18 +166,17 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
/* Disable IRQs and retry */
local_irq_disable();
- cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+ cached_flags = READ_ONCE(current_thread_info()->flags);
if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
break;
-
}
}
/* Called with IRQs disabled. */
__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
u32 cached_flags;
if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
@@ -209,7 +201,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
* special case only applies after poking regs and before the
* very next return to user mode.
*/
- ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
+ current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif
user_enter_irqoff();
@@ -247,7 +239,7 @@ static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
*/
__visible inline void syscall_return_slowpath(struct pt_regs *regs)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
u32 cached_flags = READ_ONCE(ti->flags);
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
@@ -270,7 +262,7 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
#ifdef CONFIG_X86_64
__visible void do_syscall_64(struct pt_regs *regs)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
unsigned long nr = regs->orig_ax;
enter_from_user_mode();
@@ -303,11 +295,11 @@ __visible void do_syscall_64(struct pt_regs *regs)
*/
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
unsigned int nr = (unsigned int)regs->orig_ax;
#ifdef CONFIG_IA32_EMULATION
- ti->status |= TS_COMPAT;
+ current->thread.status |= TS_COMPAT;
#endif
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 0b56666e6039..21b352a11b49 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -44,6 +44,7 @@
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/export.h>
.section .entry.text, "ax"
@@ -204,34 +205,70 @@
POP_GS_EX
.endm
+/*
+ * %eax: prev task
+ * %edx: next task
+ */
+ENTRY(__switch_to_asm)
+ /*
+ * Save callee-saved registers
+ * This must match the order in struct inactive_task_frame
+ */
+ pushl %ebp
+ pushl %ebx
+ pushl %edi
+ pushl %esi
+
+ /* switch stack */
+ movl %esp, TASK_threadsp(%eax)
+ movl TASK_threadsp(%edx), %esp
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+ movl TASK_stack_canary(%edx), %ebx
+ movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+#endif
+
+ /* restore callee-saved registers */
+ popl %esi
+ popl %edi
+ popl %ebx
+ popl %ebp
+
+ jmp __switch_to
+END(__switch_to_asm)
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * eax: prev task we switched from
+ * ebx: kernel thread func (NULL for user thread)
+ * edi: kernel thread arg
+ */
ENTRY(ret_from_fork)
pushl %eax
call schedule_tail
popl %eax
+ testl %ebx, %ebx
+ jnz 1f /* kernel threads are uncommon */
+
+2:
/* When we fork, we trace the syscall return in the child, too. */
movl %esp, %eax
call syscall_return_slowpath
jmp restore_all
-END(ret_from_fork)
-
-ENTRY(ret_from_kernel_thread)
- pushl %eax
- call schedule_tail
- popl %eax
- movl PT_EBP(%esp), %eax
- call *PT_EBX(%esp)
- movl $0, PT_EAX(%esp)
+ /* kernel thread */
+1: movl %edi, %eax
+ call *%ebx
/*
- * Kernel threads return to userspace as if returning from a syscall.
- * We should check whether anything actually uses this path and, if so,
- * consider switching it over to ret_from_fork.
+ * A kernel thread is allowed to return here after successfully
+ * calling do_execve(). Exit to userspace to complete the execve()
+ * syscall.
*/
- movl %esp, %eax
- call syscall_return_slowpath
- jmp restore_all
-ENDPROC(ret_from_kernel_thread)
+ movl $0, PT_EAX(%esp)
+ jmp 2b
+END(ret_from_fork)
/*
* Return to user mode is not as complex as all this looks,
@@ -955,6 +992,7 @@ trace:
jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
+EXPORT_SYMBOL(mcount)
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d172c619c449..ef766a358b37 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -35,6 +35,7 @@
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
+#include <asm/export.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -179,7 +180,8 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
* If we need to do entry work or if we guess we'll need to do
* exit work, go straight to the slow path.
*/
- testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ movq PER_CPU_VAR(current_task), %r11
+ testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz entry_SYSCALL64_slow_path
entry_SYSCALL_64_fastpath:
@@ -217,7 +219,8 @@ entry_SYSCALL_64_fastpath:
*/
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ movq PER_CPU_VAR(current_task), %r11
+ testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
jnz 1f
LOCKDEP_SYS_EXIT
@@ -351,8 +354,7 @@ ENTRY(stub_ptregs_64)
jmp entry_SYSCALL64_slow_path
1:
- /* Called from C */
- jmp *%rax /* called from C */
+ jmp *%rax /* Called from C */
END(stub_ptregs_64)
.macro ptregs_stub func
@@ -369,41 +371,73 @@ END(ptregs_\func)
#include <asm/syscalls_64.h>
/*
+ * %rdi: prev task
+ * %rsi: next task
+ */
+ENTRY(__switch_to_asm)
+ /*
+ * Save callee-saved registers
+ * This must match the order in inactive_task_frame
+ */
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ /* switch stack */
+ movq %rsp, TASK_threadsp(%rdi)
+ movq TASK_threadsp(%rsi), %rsp
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+ movq TASK_stack_canary(%rsi), %rbx
+ movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
+#endif
+
+ /* restore callee-saved registers */
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbx
+ popq %rbp
+
+ jmp __switch_to
+END(__switch_to_asm)
+
+/*
* A newly forked process directly context switches into this address.
*
- * rdi: prev task we switched from
+ * rax: prev task we switched from
+ * rbx: kernel thread func (NULL for user thread)
+ * r12: kernel thread arg
*/
ENTRY(ret_from_fork)
- LOCK ; btr $TIF_FORK, TI_flags(%r8)
-
+ movq %rax, %rdi
call schedule_tail /* rdi: 'prev' task parameter */
- testb $3, CS(%rsp) /* from kernel_thread? */
- jnz 1f
-
- /*
- * We came from kernel_thread. This code path is quite twisted, and
- * someone should clean it up.
- *
- * copy_thread_tls stashes the function pointer in RBX and the
- * parameter to be passed in RBP. The called function is permitted
- * to call do_execve and thereby jump to user mode.
- */
- movq RBP(%rsp), %rdi
- call *RBX(%rsp)
- movl $0, RAX(%rsp)
+ testq %rbx, %rbx /* from kernel_thread? */
+ jnz 1f /* kernel threads are uncommon */
- /*
- * Fall through as though we're exiting a syscall. This makes a
- * twisted sort of sense if we just called do_execve.
- */
-
-1:
+2:
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
TRACE_IRQS_ON /* user mode is traced as IRQS on */
SWAPGS
jmp restore_regs_and_iret
+
+1:
+ /* kernel thread */
+ movq %r12, %rdi
+ call *%rbx
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling do_execve(). Exit to userspace to complete the execve()
+ * syscall.
+ */
+ movq $0, RAX(%rsp)
+ jmp 2b
END(ret_from_fork)
/*
@@ -555,27 +589,69 @@ native_irq_return_iret:
#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
- pushq %rax
- pushq %rdi
+ /*
+ * We are running with user GSBASE. All GPRs contain their user
+ * values. We have a percpu ESPFIX stack that is eight slots
+ * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom
+ * of the ESPFIX stack.
+ *
+ * We clobber RAX and RDI in this code. We stash RDI on the
+ * normal stack and RAX on the ESPFIX stack.
+ *
+ * The ESPFIX stack layout we set up looks like this:
+ *
+ * --- top of ESPFIX stack ---
+ * SS
+ * RSP
+ * RFLAGS
+ * CS
+ * RIP <-- RSP points here when we're done
+ * RAX <-- espfix_waddr points here
+ * --- bottom of ESPFIX stack ---
+ */
+
+ pushq %rdi /* Stash user RDI */
SWAPGS
movq PER_CPU_VAR(espfix_waddr), %rdi
- movq %rax, (0*8)(%rdi) /* RAX */
- movq (2*8)(%rsp), %rax /* RIP */
+ movq %rax, (0*8)(%rdi) /* user RAX */
+ movq (1*8)(%rsp), %rax /* user RIP */
movq %rax, (1*8)(%rdi)
- movq (3*8)(%rsp), %rax /* CS */
+ movq (2*8)(%rsp), %rax /* user CS */
movq %rax, (2*8)(%rdi)
- movq (4*8)(%rsp), %rax /* RFLAGS */
+ movq (3*8)(%rsp), %rax /* user RFLAGS */
movq %rax, (3*8)(%rdi)
- movq (6*8)(%rsp), %rax /* SS */
+ movq (5*8)(%rsp), %rax /* user SS */
movq %rax, (5*8)(%rdi)
- movq (5*8)(%rsp), %rax /* RSP */
+ movq (4*8)(%rsp), %rax /* user RSP */
movq %rax, (4*8)(%rdi)
- andl $0xffff0000, %eax
- popq %rdi
+ /* Now RAX == RSP. */
+
+ andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */
+ popq %rdi /* Restore user RDI */
+
+ /*
+ * espfix_stack[31:16] == 0. The page tables are set up such that
+ * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
+ * espfix_waddr for any X. That is, there are 65536 RO aliases of
+ * the same page. Set up RSP so that RSP[31:16] contains the
+ * respective 16 bits of the /userspace/ RSP and RSP nonetheless
+ * still points to an RO alias of the ESPFIX stack.
+ */
orq PER_CPU_VAR(espfix_stack), %rax
SWAPGS
movq %rax, %rsp
- popq %rax
+
+ /*
+ * At this point, we cannot write to the stack any more, but we can
+ * still read.
+ */
+ popq %rax /* Restore user RAX */
+
+ /*
+ * RSP now points to an ordinary IRET frame, except that the page
+ * is read-only and RSP[31:16] are preloaded with the userspace
+ * values. We can now IRET back to userspace.
+ */
jmp native_irq_return_iret
#endif
END(common_interrupt)
@@ -800,6 +876,7 @@ ENTRY(native_load_gs_index)
popfq
ret
END(native_load_gs_index)
+EXPORT_SYMBOL(native_load_gs_index)
_ASM_EXTABLE(.Lgs_change, bad_gs)
.section .fixup, "ax"
@@ -1002,7 +1079,6 @@ ENTRY(error_entry)
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
-.Lerror_entry_from_usermode_swapgs:
/*
* We entered from user mode or we're pretending to have entered
* from user mode due to an IRET fault.
@@ -1045,7 +1121,8 @@ ENTRY(error_entry)
* gsbase and proceed. We'll fix up the exception and land in
* .Lgs_change's error handler with kernel gsbase.
*/
- jmp .Lerror_entry_from_usermode_swapgs
+ SWAPGS
+ jmp .Lerror_entry_done
.Lbstep_iret:
/* Fix truncated RIP */
@@ -1073,7 +1150,7 @@ END(error_entry)
/*
- * On entry, EBS is a "return to kernel mode" flag:
+ * On entry, EBX is a "return to kernel mode" flag:
* 1: already in kernel mode, don't need SWAPGS
* 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
*/
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index f848572169ea..2b3618542544 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -386,3 +386,6 @@
377 i386 copy_file_range sys_copy_file_range
378 i386 preadv2 sys_preadv2 compat_sys_preadv2
379 i386 pwritev2 sys_pwritev2 compat_sys_pwritev2
+380 i386 pkey_mprotect sys_pkey_mprotect
+381 i386 pkey_alloc sys_pkey_alloc
+382 i386 pkey_free sys_pkey_free
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index e9ce9c7c39b4..e93ef0b38db8 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -335,6 +335,9 @@
326 common copy_file_range sys_copy_file_range
327 64 preadv2 sys_preadv2
328 64 pwritev2 sys_pwritev2
+329 common pkey_mprotect sys_pkey_mprotect
+330 common pkey_alloc sys_pkey_alloc
+331 common pkey_free sys_pkey_free
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index cd3d3015d7df..751d1f992630 100644
--- a/arch/x86/entry/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
@@ -10,8 +10,11 @@ syscall_macro() {
# Entry can be either just a function name or "function/qualifier"
real_entry="${entry%%/*}"
- qualifier="${entry:${#real_entry}}" # Strip the function name
- qualifier="${qualifier:1}" # Strip the slash, if any
+ if [ "$entry" = "$real_entry" ]; then
+ qualifier=
+ else
+ qualifier=${entry#*/}
+ fi
echo "__SYSCALL_${abi}($nr, $real_entry, $qualifier)"
}
@@ -22,7 +25,7 @@ emit() {
entry="$3"
compat="$4"
- if [ "$abi" == "64" -a -n "$compat" ]; then
+ if [ "$abi" = "64" -a -n "$compat" ]; then
echo "a compat entry for a 64-bit syscall makes no sense" >&2
exit 1
fi
@@ -45,17 +48,17 @@ emit() {
grep '^[0-9]' "$in" | sort -n | (
while read nr abi name entry compat; do
abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
- if [ "$abi" == "COMMON" -o "$abi" == "64" ]; then
+ if [ "$abi" = "COMMON" -o "$abi" = "64" ]; then
# COMMON is the same as 64, except that we don't expect X32
# programs to use it. Our expectation has nothing to do with
# any generated code, so treat them the same.
emit 64 "$nr" "$entry" "$compat"
- elif [ "$abi" == "X32" ]; then
+ elif [ "$abi" = "X32" ]; then
# X32 is equivalent to 64 on an X32-compatible kernel.
echo "#ifdef CONFIG_X86_X32_ABI"
emit 64 "$nr" "$entry" "$compat"
echo "#endif"
- elif [ "$abi" == "I386" ]; then
+ elif [ "$abi" = "I386" ]; then
emit "$abi" "$nr" "$entry" "$compat"
else
echo "Unknown abi $abi" >&2
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
index e5a17114a8c4..fee6bc79b987 100644
--- a/arch/x86/entry/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -6,6 +6,7 @@
*/
#include <linux/linkage.h>
#include <asm/asm.h>
+ #include <asm/export.h>
/* put return address in eax (arg1) */
.macro THUNK name, func, put_ret_addr_in_eax=0
@@ -36,5 +37,7 @@
#ifdef CONFIG_PREEMPT
THUNK ___preempt_schedule, preempt_schedule
THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
+ EXPORT_SYMBOL(___preempt_schedule)
+ EXPORT_SYMBOL(___preempt_schedule_notrace)
#endif
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index 627ecbcb2e62..be36bf4e0957 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -8,6 +8,7 @@
#include <linux/linkage.h>
#include "calling.h"
#include <asm/asm.h>
+#include <asm/export.h>
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
.macro THUNK name, func, put_ret_addr_in_rdi=0
@@ -49,6 +50,8 @@
#ifdef CONFIG_PREEMPT
THUNK ___preempt_schedule, preempt_schedule
THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
+ EXPORT_SYMBOL(___preempt_schedule)
+ EXPORT_SYMBOL(___preempt_schedule_notrace)
#endif
#if defined(CONFIG_TRACE_IRQFLAGS) \
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 94d54d0defa7..02223cb4bcfd 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode)
return 0;
}
- ret = __pvclock_read_cycles(pvti);
+ ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
} while (pvclock_read_retry(pvti, version));
/* refer to vread_tsc() comment for rationale */
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 4f741192846d..3dab75f2a673 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -22,7 +22,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
- if (hdr->e_type != ET_DYN)
+ if (GET_LE(&hdr->e_type) != ET_DYN)
fail("input is not a shared object\n");
/* Walk the segment table. */
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index f840766659a8..23c881caabd1 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -37,54 +37,6 @@ void __init init_vdso_image(const struct vdso_image *image)
struct linux_binprm;
-/*
- * Put the vdso above the (randomized) stack with another randomized
- * offset. This way there is no hole in the middle of address space.
- * To save memory make sure it is still in the same PTE as the stack
- * top. This doesn't give that many random bits.
- *
- * Note that this algorithm is imperfect: the distribution of the vdso
- * start address within a PMD is biased toward the end.
- *
- * Only used for the 64-bit and x32 vdsos.
- */
-static unsigned long vdso_addr(unsigned long start, unsigned len)
-{
-#ifdef CONFIG_X86_32
- return 0;
-#else
- unsigned long addr, end;
- unsigned offset;
-
- /*
- * Round up the start address. It can start out unaligned as a result
- * of stack start randomization.
- */
- start = PAGE_ALIGN(start);
-
- /* Round the lowest possible end address up to a PMD boundary. */
- end = (start + len + PMD_SIZE - 1) & PMD_MASK;
- if (end >= TASK_SIZE_MAX)
- end = TASK_SIZE_MAX;
- end -= len;
-
- if (end > start) {
- offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
- addr = start + (offset << PAGE_SHIFT);
- } else {
- addr = start;
- }
-
- /*
- * Forcibly align the final address in case we have a hardware
- * issue that requires alignment for performance reasons.
- */
- addr = align_vdso_addr(addr);
-
- return addr;
-#endif
-}
-
static int vdso_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
@@ -176,30 +128,28 @@ static int vvar_fault(const struct vm_special_mapping *sm,
return VM_FAULT_SIGBUS;
}
-static int map_vdso(const struct vdso_image *image, bool calculate_addr)
+static const struct vm_special_mapping vdso_mapping = {
+ .name = "[vdso]",
+ .fault = vdso_fault,
+ .mremap = vdso_mremap,
+};
+static const struct vm_special_mapping vvar_mapping = {
+ .name = "[vvar]",
+ .fault = vvar_fault,
+};
+
+/*
+ * Add vdso and vvar mappings to current process.
+ * @image - blob to map
+ * @addr - request a specific address (zero to map at free addr)
+ */
+static int map_vdso(const struct vdso_image *image, unsigned long addr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long addr, text_start;
+ unsigned long text_start;
int ret = 0;
- static const struct vm_special_mapping vdso_mapping = {
- .name = "[vdso]",
- .fault = vdso_fault,
- .mremap = vdso_mremap,
- };
- static const struct vm_special_mapping vvar_mapping = {
- .name = "[vvar]",
- .fault = vvar_fault,
- };
-
- if (calculate_addr) {
- addr = vdso_addr(current->mm->start_stack,
- image->size - image->sym_vvar_start);
- } else {
- addr = 0;
- }
-
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
@@ -238,24 +188,104 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
- goto up_fail;
+ do_munmap(mm, text_start, image->size);
}
up_fail:
- if (ret)
+ if (ret) {
current->mm->context.vdso = NULL;
+ current->mm->context.vdso_image = NULL;
+ }
up_write(&mm->mmap_sem);
return ret;
}
+#ifdef CONFIG_X86_64
+/*
+ * Put the vdso above the (randomized) stack with another randomized
+ * offset. This way there is no hole in the middle of address space.
+ * To save memory make sure it is still in the same PTE as the stack
+ * top. This doesn't give that many random bits.
+ *
+ * Note that this algorithm is imperfect: the distribution of the vdso
+ * start address within a PMD is biased toward the end.
+ *
+ * Only used for the 64-bit and x32 vdsos.
+ */
+static unsigned long vdso_addr(unsigned long start, unsigned len)
+{
+ unsigned long addr, end;
+ unsigned offset;
+
+ /*
+ * Round up the start address. It can start out unaligned as a result
+ * of stack start randomization.
+ */
+ start = PAGE_ALIGN(start);
+
+ /* Round the lowest possible end address up to a PMD boundary. */
+ end = (start + len + PMD_SIZE - 1) & PMD_MASK;
+ if (end >= TASK_SIZE_MAX)
+ end = TASK_SIZE_MAX;
+ end -= len;
+
+ if (end > start) {
+ offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
+ addr = start + (offset << PAGE_SHIFT);
+ } else {
+ addr = start;
+ }
+
+ /*
+ * Forcibly align the final address in case we have a hardware
+ * issue that requires alignment for performance reasons.
+ */
+ addr = align_vdso_addr(addr);
+
+ return addr;
+}
+
+static int map_vdso_randomized(const struct vdso_image *image)
+{
+ unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start);
+
+ return map_vdso(image, addr);
+}
+#endif
+
+int map_vdso_once(const struct vdso_image *image, unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+
+ down_write(&mm->mmap_sem);
+ /*
+ * Check if we have already mapped vdso blob - fail to prevent
+ * abusing from userspace install_speciall_mapping, which may
+ * not do accounting and rlimit right.
+ * We could search vma near context.vdso, but it's a slowpath,
+ * so let's explicitely check all VMAs to be completely sure.
+ */
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (vma_is_special_mapping(vma, &vdso_mapping) ||
+ vma_is_special_mapping(vma, &vvar_mapping)) {
+ up_write(&mm->mmap_sem);
+ return -EEXIST;
+ }
+ }
+ up_write(&mm->mmap_sem);
+
+ return map_vdso(image, addr);
+}
+
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
if (vdso32_enabled != 1) /* Other values all mean "disabled" */
return 0;
- return map_vdso(&vdso_image_32, false);
+ return map_vdso(&vdso_image_32, 0);
}
#endif
@@ -265,7 +295,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
if (!vdso64_enabled)
return 0;
- return map_vdso(&vdso_image_64, true);
+ return map_vdso_randomized(&vdso_image_64);
}
#ifdef CONFIG_COMPAT
@@ -276,8 +306,7 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
if (test_thread_flag(TIF_X32)) {
if (!vdso64_enabled)
return 0;
-
- return map_vdso(&vdso_image_x32, true);
+ return map_vdso_randomized(&vdso_image_x32);
}
#endif
#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index d0efb5cb1b00..d31735f37ed7 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -37,6 +37,7 @@
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
+#include <asm/unwind.h>
#include "perf_event.h"
@@ -1201,6 +1202,9 @@ static int x86_pmu_add(struct perf_event *event, int flags)
* If group events scheduling transaction was started,
* skip the schedulability test here, it will be performed
* at commit time (->commit_txn) as a whole.
+ *
+ * If commit fails, we'll call ->del() on all events
+ * for which ->add() was called.
*/
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto done_collect;
@@ -1223,6 +1227,14 @@ done_collect:
cpuc->n_added += n - n0;
cpuc->n_txn += n - n0;
+ if (x86_pmu.add) {
+ /*
+ * This is before x86_pmu_enable() will call x86_pmu_start(),
+ * so we enable LBRs before an event needs them etc..
+ */
+ x86_pmu.add(event);
+ }
+
ret = 0;
out:
return ret;
@@ -1346,7 +1358,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
/*
- * If we're called during a txn, we don't need to do anything.
+ * If we're called during a txn, we only need to undo x86_pmu.add.
* The events never got scheduled and ->cancel_txn will truncate
* the event_list.
*
@@ -1354,7 +1366,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
* an event added during that same TXN.
*/
if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
- return;
+ goto do_del;
/*
* Not a TXN, therefore cleanup properly.
@@ -1384,6 +1396,15 @@ static void x86_pmu_del(struct perf_event *event, int flags)
--cpuc->n_events;
perf_event_update_userpage(event);
+
+do_del:
+ if (x86_pmu.del) {
+ /*
+ * This is after x86_pmu_stop(); so we disable LBRs after any
+ * event can need them etc..
+ */
+ x86_pmu.del(event);
+ }
}
int x86_pmu_handle_irq(struct pt_regs *regs)
@@ -2247,39 +2268,26 @@ void arch_perf_update_userpage(struct perf_event *event,
cyc2ns_read_end(data);
}
-/*
- * callchain support
- */
-
-static int backtrace_stack(void *data, char *name)
-{
- return 0;
-}
-
-static int backtrace_address(void *data, unsigned long addr, int reliable)
-{
- struct perf_callchain_entry_ctx *entry = data;
-
- return perf_callchain_store(entry, addr);
-}
-
-static const struct stacktrace_ops backtrace_ops = {
- .stack = backtrace_stack,
- .address = backtrace_address,
- .walk_stack = print_context_stack_bp,
-};
-
void
perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
{
+ struct unwind_state state;
+ unsigned long addr;
+
if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
/* TODO: We don't support guest os callchain now */
return;
}
- perf_callchain_store(entry, regs->ip);
+ if (perf_callchain_store(entry, regs->ip))
+ return;
- dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
+ for (unwind_start(&state, current, regs, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || perf_callchain_store(entry, addr))
+ return;
+ }
}
static inline int
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 4c9a79b9cd69..eab0915f5995 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1906,13 +1906,6 @@ static void intel_pmu_disable_event(struct perf_event *event)
cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
cpuc->intel_cp_status &= ~(1ull << hwc->idx);
- /*
- * must disable before any actual event
- * because any event may be combined with LBR
- */
- if (needs_branch_stack(event))
- intel_pmu_lbr_disable(event);
-
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_disable_fixed(hwc);
return;
@@ -1924,6 +1917,14 @@ static void intel_pmu_disable_event(struct perf_event *event)
intel_pmu_pebs_disable(event);
}
+static void intel_pmu_del_event(struct perf_event *event)
+{
+ if (needs_branch_stack(event))
+ intel_pmu_lbr_del(event);
+ if (event->attr.precise_ip)
+ intel_pmu_pebs_del(event);
+}
+
static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
{
int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
@@ -1967,12 +1968,6 @@ static void intel_pmu_enable_event(struct perf_event *event)
intel_pmu_enable_bts(hwc->config);
return;
}
- /*
- * must enabled before any actual event
- * because any event may be combined with LBR
- */
- if (needs_branch_stack(event))
- intel_pmu_lbr_enable(event);
if (event->attr.exclude_host)
cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
@@ -1993,6 +1988,14 @@ static void intel_pmu_enable_event(struct perf_event *event)
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}
+static void intel_pmu_add_event(struct perf_event *event)
+{
+ if (event->attr.precise_ip)
+ intel_pmu_pebs_add(event);
+ if (needs_branch_stack(event))
+ intel_pmu_lbr_add(event);
+}
+
/*
* Save and restart an expired event. Called by NMI contexts,
* so it has to be careful about preempting normal event ops:
@@ -3291,6 +3294,8 @@ static __initconst const struct x86_pmu intel_pmu = {
.enable_all = intel_pmu_enable_all,
.enable = intel_pmu_enable_event,
.disable = intel_pmu_disable_event,
+ .add = intel_pmu_add_event,
+ .del = intel_pmu_del_event,
.hw_config = intel_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
@@ -3893,6 +3898,7 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_XEON_PHI_KNL:
+ case INTEL_FAM6_XEON_PHI_KNM:
memcpy(hw_cache_event_ids,
slm_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs,
@@ -3907,7 +3913,7 @@ __init int intel_pmu_init(void)
x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
- pr_cont("Knights Landing events, ");
+ pr_cont("Knights Landing/Mill events, ");
break;
case INTEL_FAM6_SKYLAKE_MOBILE:
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 9b983a474253..0319311dbdbb 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -806,9 +806,65 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
return &emptyconstraint;
}
-static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+/*
+ * We need the sched_task callback even for per-cpu events when we use
+ * the large interrupt threshold, such that we can provide PID and TID
+ * to PEBS samples.
+ */
+static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
+{
+ return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
+}
+
+static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
+{
+ struct debug_store *ds = cpuc->ds;
+ u64 threshold;
+
+ if (cpuc->n_pebs == cpuc->n_large_pebs) {
+ threshold = ds->pebs_absolute_maximum -
+ x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+ } else {
+ threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+ }
+
+ ds->pebs_interrupt_threshold = threshold;
+}
+
+static void
+pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
+{
+ /*
+ * Make sure we get updated with the first PEBS
+ * event. It will trigger also during removal, but
+ * that does not hurt:
+ */
+ bool update = cpuc->n_pebs == 1;
+
+ if (needed_cb != pebs_needs_sched_cb(cpuc)) {
+ if (!needed_cb)
+ perf_sched_cb_inc(pmu);
+ else
+ perf_sched_cb_dec(pmu);
+
+ update = true;
+ }
+
+ if (update)
+ pebs_update_threshold(cpuc);
+}
+
+void intel_pmu_pebs_add(struct perf_event *event)
{
- return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needed_cb = pebs_needs_sched_cb(cpuc);
+
+ cpuc->n_pebs++;
+ if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+ cpuc->n_large_pebs++;
+
+ pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
}
void intel_pmu_pebs_enable(struct perf_event *event)
@@ -816,12 +872,9 @@ void intel_pmu_pebs_enable(struct perf_event *event)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct debug_store *ds = cpuc->ds;
- bool first_pebs;
- u64 threshold;
hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
- first_pebs = !pebs_is_enabled(cpuc);
cpuc->pebs_enabled |= 1ULL << hwc->idx;
if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
@@ -830,46 +883,34 @@ void intel_pmu_pebs_enable(struct perf_event *event)
cpuc->pebs_enabled |= 1ULL << 63;
/*
- * When the event is constrained enough we can use a larger
- * threshold and run the event with less frequent PMI.
+ * Use auto-reload if possible to save a MSR write in the PMI.
+ * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
*/
- if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
- threshold = ds->pebs_absolute_maximum -
- x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
-
- if (first_pebs)
- perf_sched_cb_inc(event->ctx->pmu);
- } else {
- threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
- /*
- * If not all events can use larger buffer,
- * roll back to threshold = 1
- */
- if (!first_pebs &&
- (ds->pebs_interrupt_threshold > threshold))
- perf_sched_cb_dec(event->ctx->pmu);
- }
-
- /* Use auto-reload if possible to save a MSR write in the PMI */
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
ds->pebs_event_reset[hwc->idx] =
(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
}
+}
+
+void intel_pmu_pebs_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ bool needed_cb = pebs_needs_sched_cb(cpuc);
- if (first_pebs || ds->pebs_interrupt_threshold > threshold)
- ds->pebs_interrupt_threshold = threshold;
+ cpuc->n_pebs--;
+ if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+ cpuc->n_large_pebs--;
+
+ pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
}
void intel_pmu_pebs_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- struct debug_store *ds = cpuc->ds;
- bool large_pebs = ds->pebs_interrupt_threshold >
- ds->pebs_buffer_base + x86_pmu.pebs_record_size;
- if (large_pebs)
+ if (cpuc->n_pebs == cpuc->n_large_pebs)
intel_pmu_drain_pebs_buffer();
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
@@ -879,9 +920,6 @@ void intel_pmu_pebs_disable(struct perf_event *event)
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
cpuc->pebs_enabled &= ~(1ULL << 63);
- if (large_pebs && !pebs_is_enabled(cpuc))
- perf_sched_cb_dec(event->ctx->pmu);
-
if (cpuc->enabled)
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 707d358e0dff..81b321ace8e0 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -380,7 +380,6 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
{
- struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
/*
@@ -390,31 +389,21 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
*/
task_ctx = ctx ? ctx->task_ctx_data : NULL;
if (task_ctx) {
- if (sched_in) {
+ if (sched_in)
__intel_pmu_lbr_restore(task_ctx);
- cpuc->lbr_context = ctx;
- } else {
+ else
__intel_pmu_lbr_save(task_ctx);
- }
return;
}
/*
- * When sampling the branck stack in system-wide, it may be
- * necessary to flush the stack on context switch. This happens
- * when the branch stack does not tag its entries with the pid
- * of the current task. Otherwise it becomes impossible to
- * associate a branch entry with a task. This ambiguity is more
- * likely to appear when the branch stack supports priv level
- * filtering and the user sets it to monitor only at the user
- * level (which could be a useful measurement in system-wide
- * mode). In that case, the risk is high of having a branch
- * stack with branch from multiple tasks.
- */
- if (sched_in) {
+ * Since a context switch can flip the address space and LBR entries
+ * are not tagged with an identifier, we need to wipe the LBR, even for
+ * per-cpu events. You simply cannot resolve the branches from the old
+ * address space.
+ */
+ if (sched_in)
intel_pmu_lbr_reset();
- cpuc->lbr_context = ctx;
- }
}
static inline bool branch_user_callstack(unsigned br_sel)
@@ -422,7 +411,7 @@ static inline bool branch_user_callstack(unsigned br_sel)
return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
}
-void intel_pmu_lbr_enable(struct perf_event *event)
+void intel_pmu_lbr_add(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
@@ -430,27 +419,38 @@ void intel_pmu_lbr_enable(struct perf_event *event)
if (!x86_pmu.lbr_nr)
return;
- /*
- * Reset the LBR stack if we changed task context to
- * avoid data leaks.
- */
- if (event->ctx->task && cpuc->lbr_context != event->ctx) {
- intel_pmu_lbr_reset();
- cpuc->lbr_context = event->ctx;
- }
cpuc->br_sel = event->hw.branch_reg.reg;
- if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
- event->ctx->task_ctx_data) {
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) {
task_ctx = event->ctx->task_ctx_data;
task_ctx->lbr_callstack_users++;
}
- cpuc->lbr_users++;
+ /*
+ * Request pmu::sched_task() callback, which will fire inside the
+ * regular perf event scheduling, so that call will:
+ *
+ * - restore or wipe; when LBR-callstack,
+ * - wipe; otherwise,
+ *
+ * when this is from __perf_event_task_sched_in().
+ *
+ * However, if this is from perf_install_in_context(), no such callback
+ * will follow and we'll need to reset the LBR here if this is the
+ * first LBR event.
+ *
+ * The problem is, we cannot tell these cases apart... but we can
+ * exclude the biggest chunk of cases by looking at
+ * event->total_time_running. An event that has accrued runtime cannot
+ * be 'new'. Conversely, a new event can get installed through the
+ * context switch path for the first time.
+ */
perf_sched_cb_inc(event->ctx->pmu);
+ if (!cpuc->lbr_users++ && !event->total_time_running)
+ intel_pmu_lbr_reset();
}
-void intel_pmu_lbr_disable(struct perf_event *event)
+void intel_pmu_lbr_del(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx;
@@ -458,8 +458,8 @@ void intel_pmu_lbr_disable(struct perf_event *event)
if (!x86_pmu.lbr_nr)
return;
- if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
- event->ctx->task_ctx_data) {
+ if (branch_user_callstack(cpuc->br_sel) &&
+ event->ctx->task_ctx_data) {
task_ctx = event->ctx->task_ctx_data;
task_ctx->lbr_callstack_users--;
}
@@ -467,12 +467,6 @@ void intel_pmu_lbr_disable(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
-
- if (cpuc->enabled && !cpuc->lbr_users) {
- __intel_pmu_lbr_disable();
- /* avoid stale pointer */
- cpuc->lbr_context = NULL;
- }
}
void intel_pmu_lbr_enable_all(bool pmi)
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 861a7d9cb60f..c5047b8f777b 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -69,6 +69,8 @@ static struct pt_cap_desc {
PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)),
PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)),
PT_CAP(mtc, 0, CR_EBX, BIT(3)),
+ PT_CAP(ptwrite, 0, CR_EBX, BIT(4)),
+ PT_CAP(power_event_trace, 0, CR_EBX, BIT(5)),
PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
PT_CAP(single_range_output, 0, CR_ECX, BIT(2)),
@@ -259,10 +261,16 @@ fail:
#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \
RTIT_CTL_MTC_RANGE)
+#define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \
+ RTIT_CTL_FUP_ON_PTW)
+
#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \
RTIT_CTL_DISRETC | \
RTIT_CTL_CYC_PSB | \
- RTIT_CTL_MTC)
+ RTIT_CTL_MTC | \
+ RTIT_CTL_PWR_EVT_EN | \
+ RTIT_CTL_FUP_ON_PTW | \
+ RTIT_CTL_PTW_EN)
static bool pt_event_valid(struct perf_event *event)
{
@@ -311,6 +319,20 @@ static bool pt_event_valid(struct perf_event *event)
return false;
}
+ if (config & RTIT_CTL_PWR_EVT_EN &&
+ !pt_cap_get(PT_CAP_power_event_trace))
+ return false;
+
+ if (config & RTIT_CTL_PTW) {
+ if (!pt_cap_get(PT_CAP_ptwrite))
+ return false;
+
+ /* FUPonPTW without PTW doesn't make sense */
+ if ((config & RTIT_CTL_FUP_ON_PTW) &&
+ !(config & RTIT_CTL_PTW_EN))
+ return false;
+ }
+
return true;
}
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index efffa4a09f68..53473c21b554 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -26,11 +26,14 @@
#define RTIT_CTL_CYCLEACC BIT(1)
#define RTIT_CTL_OS BIT(2)
#define RTIT_CTL_USR BIT(3)
+#define RTIT_CTL_PWR_EVT_EN BIT(4)
+#define RTIT_CTL_FUP_ON_PTW BIT(5)
#define RTIT_CTL_CR3EN BIT(7)
#define RTIT_CTL_TOPA BIT(8)
#define RTIT_CTL_MTC_EN BIT(9)
#define RTIT_CTL_TSC_EN BIT(10)
#define RTIT_CTL_DISRETC BIT(11)
+#define RTIT_CTL_PTW_EN BIT(12)
#define RTIT_CTL_BRANCH_EN BIT(13)
#define RTIT_CTL_MTC_RANGE_OFFSET 14
#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
@@ -91,6 +94,8 @@ enum pt_capabilities {
PT_CAP_psb_cyc,
PT_CAP_ip_filtering,
PT_CAP_mtc,
+ PT_CAP_ptwrite,
+ PT_CAP_power_event_trace,
PT_CAP_topa_output,
PT_CAP_topa_multiple_entries,
PT_CAP_single_range_output,
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 28865938aadf..0a535cea8ff3 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -357,6 +357,8 @@ static int rapl_pmu_event_init(struct perf_event *event)
if (event->cpu < 0)
return -EINVAL;
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
/*
* check event is known (determines counter)
*/
@@ -761,10 +763,13 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = {
X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init),
+
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init),
{},
};
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 463dc7a5a6c3..efca2685d876 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -664,6 +664,8 @@ static int uncore_pmu_event_init(struct perf_event *event)
event->cpu = box->cpu;
event->pmu_private = box;
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
event->hw.idx = -1;
event->hw.last_tag = ~0ULL;
event->hw.extra_reg.idx = EXTRA_REG_NONE;
@@ -683,7 +685,8 @@ static int uncore_pmu_event_init(struct perf_event *event)
/* fixed counters have event field hardcoded to zero */
hwc->config = 0ULL;
} else {
- hwc->config = event->attr.config & pmu->type->event_mask;
+ hwc->config = event->attr.config &
+ (pmu->type->event_mask | ((u64)pmu->type->event_mask_ext << 32));
if (pmu->type->ops->hw_config) {
ret = pmu->type->ops->hw_config(box, event);
if (ret)
@@ -1321,6 +1324,11 @@ static const struct intel_uncore_init_fun skl_uncore_init __initconst = {
.pci_init = skl_uncore_pci_init,
};
+static const struct intel_uncore_init_fun skx_uncore_init __initconst = {
+ .cpu_init = skx_uncore_cpu_init,
+ .pci_init = skx_uncore_pci_init,
+};
+
static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init),
@@ -1341,8 +1349,10 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, bdx_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, bdx_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init),
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init),
+ X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init),
{},
};
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 78b9c23e2d8d..ad986c1e29bc 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -44,6 +44,7 @@ struct intel_uncore_type {
unsigned perf_ctr;
unsigned event_ctl;
unsigned event_mask;
+ unsigned event_mask_ext;
unsigned fixed_ctr;
unsigned fixed_ctl;
unsigned box_ctl;
@@ -120,6 +121,7 @@ struct intel_uncore_box {
};
#define UNCORE_BOX_FLAG_INITIATED 0
+#define UNCORE_BOX_FLAG_CTL_OFFS8 1 /* event config registers are 8-byte apart */
struct uncore_event_desc {
struct kobj_attribute attr;
@@ -172,6 +174,9 @@ static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
static inline
unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
{
+ if (test_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags))
+ return idx * 8 + box->pmu->type->event_ctl;
+
return idx * 4 + box->pmu->type->event_ctl;
}
@@ -377,6 +382,8 @@ int bdx_uncore_pci_init(void);
void bdx_uncore_cpu_init(void);
int knl_uncore_pci_init(void);
void knl_uncore_cpu_init(void);
+int skx_uncore_pci_init(void);
+void skx_uncore_cpu_init(void);
/* perf_event_intel_uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 9d35ec0cb8fc..5f845eef9a4d 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -388,6 +388,8 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
event->cpu = box->cpu;
event->pmu_private = box;
+ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+
event->hw.idx = -1;
event->hw.last_tag = ~0ULL;
event->hw.extra_reg.idx = EXTRA_REG_NONE;
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 8aee83bcf71f..272427700d48 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1,6 +1,10 @@
/* SandyBridge-EP/IvyTown uncore support */
#include "uncore.h"
+/* SNB-EP pci bus to socket mapping */
+#define SNBEP_CPUNODEID 0x40
+#define SNBEP_GIDNIDMAP 0x54
+
/* SNB-EP Box level control */
#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
@@ -264,15 +268,72 @@
SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+/* SKX pci bus to socket mapping */
+#define SKX_CPUNODEID 0xc0
+#define SKX_GIDNIDMAP 0xd4
+
+/* SKX CHA */
+#define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_STATE (0x3ffULL << 17)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_REM (0x1ULL << 32)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_LOC (0x1ULL << 33)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC (0x1ULL << 35)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NM (0x1ULL << 36)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM (0x1ULL << 37)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC0 (0x3ffULL << 41)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC1 (0x3ffULL << 51)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define SKX_CHA_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63)
+
+/* SKX IIO */
+#define SKX_IIO0_MSR_PMON_CTL0 0xa48
+#define SKX_IIO0_MSR_PMON_CTR0 0xa41
+#define SKX_IIO0_MSR_PMON_BOX_CTL 0xa40
+#define SKX_IIO_MSR_OFFSET 0x20
+
+#define SKX_PMON_CTL_TRESH_MASK (0xff << 24)
+#define SKX_PMON_CTL_TRESH_MASK_EXT (0xf)
+#define SKX_PMON_CTL_CH_MASK (0xff << 4)
+#define SKX_PMON_CTL_FC_MASK (0x7 << 12)
+#define SKX_IIO_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SKX_PMON_CTL_TRESH_MASK)
+#define SKX_IIO_PMON_RAW_EVENT_MASK_EXT (SKX_PMON_CTL_TRESH_MASK_EXT | \
+ SKX_PMON_CTL_CH_MASK | \
+ SKX_PMON_CTL_FC_MASK)
+
+/* SKX IRP */
+#define SKX_IRP0_MSR_PMON_CTL0 0xa5b
+#define SKX_IRP0_MSR_PMON_CTR0 0xa59
+#define SKX_IRP0_MSR_PMON_BOX_CTL 0xa58
+#define SKX_IRP_MSR_OFFSET 0x20
+
+/* SKX UPI */
+#define SKX_UPI_PCI_PMON_CTL0 0x350
+#define SKX_UPI_PCI_PMON_CTR0 0x318
+#define SKX_UPI_PCI_PMON_BOX_CTL 0x378
+#define SKX_PMON_CTL_UMASK_EXT 0xff
+
+/* SKX M2M */
+#define SKX_M2M_PCI_PMON_CTL0 0x228
+#define SKX_M2M_PCI_PMON_CTR0 0x200
+#define SKX_M2M_PCI_PMON_BOX_CTL 0x258
+
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6");
DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-39");
DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35");
DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29");
DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
@@ -280,6 +341,8 @@ DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31");
+DEFINE_UNCORE_FORMAT_ATTR(ch_mask, ch_mask, "config:36-43");
+DEFINE_UNCORE_FORMAT_ATTR(fc_mask, fc_mask, "config:44-46");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
@@ -288,18 +351,26 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link4, filter_link, "config1:9-12");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state5, filter_state, "config1:17-26");
+DEFINE_UNCORE_FORMAT_ATTR(filter_rem, filter_rem, "config1:32");
+DEFINE_UNCORE_FORMAT_ATTR(filter_loc, filter_loc, "config1:33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nm, filter_nm, "config1:36");
+DEFINE_UNCORE_FORMAT_ATTR(filter_not_nm, filter_not_nm, "config1:37");
DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33");
DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35");
DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc_0, filter_opc0, "config1:41-50");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc_1, filter_opc1, "config1:51-60");
DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
@@ -1153,7 +1224,7 @@ static struct pci_driver snbep_uncore_pci_driver = {
/*
* build pci bus to socket mapping
*/
-static int snbep_pci2phy_map_init(int devid)
+static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse)
{
struct pci_dev *ubox_dev = NULL;
int i, bus, nodeid, segment;
@@ -1168,12 +1239,12 @@ static int snbep_pci2phy_map_init(int devid)
break;
bus = ubox_dev->bus->number;
/* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, 0x40, &config);
+ err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
if (err)
break;
nodeid = config;
/* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, 0x54, &config);
+ err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
if (err)
break;
@@ -1207,11 +1278,20 @@ static int snbep_pci2phy_map_init(int devid)
raw_spin_lock(&pci2phy_map_lock);
list_for_each_entry(map, &pci2phy_map_head, list) {
i = -1;
- for (bus = 255; bus >= 0; bus--) {
- if (map->pbus_to_physid[bus] >= 0)
- i = map->pbus_to_physid[bus];
- else
- map->pbus_to_physid[bus] = i;
+ if (reverse) {
+ for (bus = 255; bus >= 0; bus--) {
+ if (map->pbus_to_physid[bus] >= 0)
+ i = map->pbus_to_physid[bus];
+ else
+ map->pbus_to_physid[bus] = i;
+ }
+ } else {
+ for (bus = 0; bus <= 255; bus++) {
+ if (map->pbus_to_physid[bus] >= 0)
+ i = map->pbus_to_physid[bus];
+ else
+ map->pbus_to_physid[bus] = i;
+ }
}
}
raw_spin_unlock(&pci2phy_map_lock);
@@ -1224,7 +1304,7 @@ static int snbep_pci2phy_map_init(int devid)
int snbep_uncore_pci_init(void)
{
- int ret = snbep_pci2phy_map_init(0x3ce0);
+ int ret = snbep_pci2phy_map_init(0x3ce0, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
if (ret)
return ret;
uncore_pci_uncores = snbep_pci_uncores;
@@ -1788,7 +1868,7 @@ static struct pci_driver ivbep_uncore_pci_driver = {
int ivbep_uncore_pci_init(void)
{
- int ret = snbep_pci2phy_map_init(0x0e1e);
+ int ret = snbep_pci2phy_map_init(0x0e1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
if (ret)
return ret;
uncore_pci_uncores = ivbep_pci_uncores;
@@ -2897,7 +2977,7 @@ static struct pci_driver hswep_uncore_pci_driver = {
int hswep_uncore_pci_init(void)
{
- int ret = snbep_pci2phy_map_init(0x2f1e);
+ int ret = snbep_pci2phy_map_init(0x2f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
if (ret)
return ret;
uncore_pci_uncores = hswep_pci_uncores;
@@ -3186,7 +3266,7 @@ static struct pci_driver bdx_uncore_pci_driver = {
int bdx_uncore_pci_init(void)
{
- int ret = snbep_pci2phy_map_init(0x6f1e);
+ int ret = snbep_pci2phy_map_init(0x6f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true);
if (ret)
return ret;
@@ -3196,3 +3276,525 @@ int bdx_uncore_pci_init(void)
}
/* end of BDX uncore support */
+
+/* SKX uncore support */
+
+static struct intel_uncore_type skx_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct attribute *skx_uncore_cha_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid4.attr,
+ &format_attr_filter_link4.attr,
+ &format_attr_filter_state5.attr,
+ &format_attr_filter_rem.attr,
+ &format_attr_filter_loc.attr,
+ &format_attr_filter_nm.attr,
+ &format_attr_filter_all_op.attr,
+ &format_attr_filter_not_nm.attr,
+ &format_attr_filter_opc_0.attr,
+ &format_attr_filter_opc_1.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_c6.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static struct attribute_group skx_uncore_chabox_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_cha_formats_attr,
+};
+
+static struct event_constraint skx_uncore_chabox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg skx_uncore_cha_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8134, 0xffff, 0x4),
+};
+
+static u64 skx_cha_filter_mask(int fields)
+{
+ u64 mask = 0;
+
+ if (fields & 0x1)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_TID;
+ if (fields & 0x2)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK;
+ if (fields & 0x4)
+ mask |= SKX_CHA_MSR_PMON_BOX_FILTER_STATE;
+ return mask;
+}
+
+static struct event_constraint *
+skx_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, skx_cha_filter_mask);
+}
+
+static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct extra_reg *er;
+ int idx = 0;
+
+ for (er = skx_uncore_cha_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ idx |= er->idx;
+ }
+
+ if (idx) {
+ reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ reg1->config = event->attr.config1 & skx_cha_filter_mask(idx);
+ reg1->idx = idx;
+ }
+ return 0;
+}
+
+static struct intel_uncore_ops skx_uncore_chabox_ops = {
+ /* There is no frz_en for chabox ctl */
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = skx_cha_hw_config,
+ .get_constraint = skx_cha_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+static struct intel_uncore_type skx_uncore_chabox = {
+ .name = "cha",
+ .num_counters = 4,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = skx_uncore_chabox_constraints,
+ .ops = &skx_uncore_chabox_ops,
+ .format_group = &skx_uncore_chabox_format_group,
+};
+
+static struct attribute *skx_uncore_iio_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh9.attr,
+ &format_attr_ch_mask.attr,
+ &format_attr_fc_mask.attr,
+ NULL,
+};
+
+static struct attribute_group skx_uncore_iio_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_iio_formats_attr,
+};
+
+static struct event_constraint skx_uncore_iio_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x88, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x95, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc0, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xc5, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0xd4, 0xc),
+ EVENT_CONSTRAINT_END
+};
+
+static void skx_iio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops skx_uncore_iio_ops = {
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = skx_iio_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_iio = {
+ .name = "iio",
+ .num_counters = 4,
+ .num_boxes = 5,
+ .perf_ctr_bits = 48,
+ .event_ctl = SKX_IIO0_MSR_PMON_CTL0,
+ .perf_ctr = SKX_IIO0_MSR_PMON_CTR0,
+ .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT,
+ .box_ctl = SKX_IIO0_MSR_PMON_BOX_CTL,
+ .msr_offset = SKX_IIO_MSR_OFFSET,
+ .constraints = skx_uncore_iio_constraints,
+ .ops = &skx_uncore_iio_ops,
+ .format_group = &skx_uncore_iio_format_group,
+};
+
+static struct attribute *skx_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group skx_uncore_format_group = {
+ .name = "format",
+ .attrs = skx_uncore_formats_attr,
+};
+
+static struct intel_uncore_type skx_uncore_irp = {
+ .name = "irp",
+ .num_counters = 2,
+ .num_boxes = 5,
+ .perf_ctr_bits = 48,
+ .event_ctl = SKX_IRP0_MSR_PMON_CTL0,
+ .perf_ctr = SKX_IRP0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SKX_IRP0_MSR_PMON_BOX_CTL,
+ .msr_offset = SKX_IRP_MSR_OFFSET,
+ .ops = &skx_uncore_iio_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct intel_uncore_ops skx_uncore_pcu_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = hswep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type skx_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &skx_uncore_pcu_ops,
+ .format_group = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *skx_msr_uncores[] = {
+ &skx_uncore_ubox,
+ &skx_uncore_chabox,
+ &skx_uncore_iio,
+ &skx_uncore_irp,
+ &skx_uncore_pcu,
+ NULL,
+};
+
+static int skx_count_chabox(void)
+{
+ struct pci_dev *chabox_dev = NULL;
+ int bus, count = 0;
+
+ while (1) {
+ chabox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x208d, chabox_dev);
+ if (!chabox_dev)
+ break;
+ if (count == 0)
+ bus = chabox_dev->bus->number;
+ if (bus != chabox_dev->bus->number)
+ break;
+ count++;
+ }
+
+ pci_dev_put(chabox_dev);
+ return count;
+}
+
+void skx_uncore_cpu_init(void)
+{
+ skx_uncore_chabox.num_boxes = skx_count_chabox();
+ uncore_msr_uncores = skx_msr_uncores;
+}
+
+static struct intel_uncore_type skx_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 6,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct attribute *skx_upi_uncore_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask_ext.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group skx_upi_uncore_format_group = {
+ .name = "format",
+ .attrs = skx_upi_uncore_formats_attr,
+};
+
+static void skx_upi_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, SKX_UPI_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops skx_upi_uncore_pci_ops = {
+ .init_box = skx_upi_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_upi = {
+ .name = "upi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SKX_UPI_PCI_PMON_CTR0,
+ .event_ctl = SKX_UPI_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .event_mask_ext = SKX_PMON_CTL_UMASK_EXT,
+ .box_ctl = SKX_UPI_PCI_PMON_BOX_CTL,
+ .ops = &skx_upi_uncore_pci_ops,
+ .format_group = &skx_upi_uncore_format_group,
+};
+
+static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, SKX_M2M_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT);
+}
+
+static struct intel_uncore_ops skx_m2m_uncore_pci_ops = {
+ .init_box = skx_m2m_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+static struct intel_uncore_type skx_uncore_m2m = {
+ .name = "m2m",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SKX_M2M_PCI_PMON_CTR0,
+ .event_ctl = SKX_M2M_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SKX_M2M_PCI_PMON_BOX_CTL,
+ .ops = &skx_m2m_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct event_constraint skx_uncore_m2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type skx_uncore_m2pcie = {
+ .name = "m2pcie",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .constraints = skx_uncore_m2pcie_constraints,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+static struct event_constraint skx_uncore_m3upi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x1d, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x1e, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x40, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x4f, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x50, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x51, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x52, 0x7),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type skx_uncore_m3upi = {
+ .name = "m3upi",
+ .num_counters = 3,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .constraints = skx_uncore_m3upi_constraints,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_pci_ops,
+ .format_group = &skx_uncore_format_group,
+};
+
+enum {
+ SKX_PCI_UNCORE_IMC,
+ SKX_PCI_UNCORE_M2M,
+ SKX_PCI_UNCORE_UPI,
+ SKX_PCI_UNCORE_M2PCIE,
+ SKX_PCI_UNCORE_M3UPI,
+};
+
+static struct intel_uncore_type *skx_pci_uncores[] = {
+ [SKX_PCI_UNCORE_IMC] = &skx_uncore_imc,
+ [SKX_PCI_UNCORE_M2M] = &skx_uncore_m2m,
+ [SKX_PCI_UNCORE_UPI] = &skx_uncore_upi,
+ [SKX_PCI_UNCORE_M2PCIE] = &skx_uncore_m2pcie,
+ [SKX_PCI_UNCORE_M3UPI] = &skx_uncore_m3upi,
+ NULL,
+};
+
+static const struct pci_device_id skx_uncore_pci_ids[] = {
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 2, SKX_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 6, SKX_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 2, SKX_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 2, SKX_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 6, SKX_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(13, 2, SKX_PCI_UNCORE_IMC, 5),
+ },
+ { /* M2M0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 0, SKX_PCI_UNCORE_M2M, 0),
+ },
+ { /* M2M1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 0, SKX_PCI_UNCORE_M2M, 1),
+ },
+ { /* UPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(14, 0, SKX_PCI_UNCORE_UPI, 0),
+ },
+ { /* UPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, SKX_PCI_UNCORE_UPI, 1),
+ },
+ { /* UPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, SKX_PCI_UNCORE_UPI, 2),
+ },
+ { /* M2PCIe 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 1, SKX_PCI_UNCORE_M2PCIE, 0),
+ },
+ { /* M2PCIe 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 1, SKX_PCI_UNCORE_M2PCIE, 1),
+ },
+ { /* M2PCIe 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(23, 1, SKX_PCI_UNCORE_M2PCIE, 2),
+ },
+ { /* M2PCIe 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3),
+ },
+ { /* M3UPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, SKX_PCI_UNCORE_M3UPI, 0),
+ },
+ { /* M3UPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 1),
+ },
+ { /* M3UPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C),
+ .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 4, SKX_PCI_UNCORE_M3UPI, 2),
+ },
+ { /* end: all zeroes */ }
+};
+
+
+static struct pci_driver skx_uncore_pci_driver = {
+ .name = "skx_uncore",
+ .id_table = skx_uncore_pci_ids,
+};
+
+int skx_uncore_pci_init(void)
+{
+ /* need to double check pci address */
+ int ret = snbep_pci2phy_map_init(0x2014, SKX_CPUNODEID, SKX_GIDNIDMAP, false);
+
+ if (ret)
+ return ret;
+
+ uncore_pci_uncores = skx_pci_uncores;
+ uncore_pci_driver = &skx_uncore_pci_driver;
+ return 0;
+}
+
+/* end of SKX uncore support */
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 8c4a47706296..5874d8de1f8d 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -194,12 +194,13 @@ struct cpu_hw_events {
*/
struct debug_store *ds;
u64 pebs_enabled;
+ int n_pebs;
+ int n_large_pebs;
/*
* Intel LBR bits
*/
int lbr_users;
- void *lbr_context;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
struct er_account *lbr_sel;
@@ -508,6 +509,8 @@ struct x86_pmu {
void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
+ void (*add)(struct perf_event *);
+ void (*del)(struct perf_event *);
int (*hw_config)(struct perf_event *event);
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
@@ -888,6 +891,10 @@ extern struct event_constraint intel_skl_pebs_event_constraints[];
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
+void intel_pmu_pebs_add(struct perf_event *event);
+
+void intel_pmu_pebs_del(struct perf_event *event);
+
void intel_pmu_pebs_enable(struct perf_event *event);
void intel_pmu_pebs_disable(struct perf_event *event);
@@ -906,9 +913,9 @@ u64 lbr_from_signext_quirk_wr(u64 val);
void intel_pmu_lbr_reset(void);
-void intel_pmu_lbr_enable(struct perf_event *event);
+void intel_pmu_lbr_add(struct perf_event *event);
-void intel_pmu_lbr_disable(struct perf_event *event);
+void intel_pmu_lbr_del(struct perf_event *event);
void intel_pmu_lbr_enable_all(bool pmi);
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 2f29f4e407c3..cb13c0564ea7 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -378,7 +378,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode);
} put_user_catch(err);
- err |= copy_siginfo_to_user32(&frame->info, &ksig->info);
+ err |= __copy_siginfo_to_user32(&frame->info, &ksig->info, false);
err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index e77a6443104f..1b020381ab38 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -217,10 +217,14 @@ static inline int alternatives_text_reserved(void *start, void *end)
*/
#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \
output, input...) \
+{ \
+ register void *__sp asm(_ASM_SP); \
asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
"call %P[new2]", feature2) \
- : output : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
- [new2] "i" (newfunc2), ## input)
+ : output, "+r" (__sp) \
+ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \
+ [new2] "i" (newfunc2), ## input); \
+}
/*
* use this macro(s) if you need more than one output parameter
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 124357773ffa..f5aaf6c83222 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -650,8 +650,8 @@ static inline void entering_ack_irq(void)
static inline void ipi_entering_ack_irq(void)
{
- ack_APIC_irq();
irq_enter();
+ ack_APIC_irq();
}
static inline void exiting_irq(void)
@@ -661,9 +661,8 @@ static inline void exiting_irq(void)
static inline void exiting_ack_irq(void)
{
- irq_exit();
- /* Ack only at the end to avoid potential reentry */
ack_APIC_irq();
+ irq_exit();
}
extern void ioapic_zap_locks(void);
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 61518cf79437..872877d930de 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -4,7 +4,6 @@
/* Caches aren't brain-dead on the intel. */
#include <asm-generic/cacheflush.h>
#include <asm/special_insns.h>
-#include <asm/uaccess.h>
/*
* The set_memory_* API can be used to change various attributes of a virtual
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 9733361fed6f..97848cdfcb1a 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -158,53 +158,9 @@ extern void __add_wrong_size(void)
* value of "*ptr".
*
* xadd() is locked when multiple CPUs are online
- * xadd_sync() is always locked
- * xadd_local() is never locked
*/
#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock)
#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX)
-#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ")
-#define xadd_local(ptr, inc) __xadd((ptr), (inc), "")
-
-#define __add(ptr, inc, lock) \
- ({ \
- __typeof__ (*(ptr)) __ret = (inc); \
- switch (sizeof(*(ptr))) { \
- case __X86_CASE_B: \
- asm volatile (lock "addb %b1, %0\n" \
- : "+m" (*(ptr)) : "qi" (inc) \
- : "memory", "cc"); \
- break; \
- case __X86_CASE_W: \
- asm volatile (lock "addw %w1, %0\n" \
- : "+m" (*(ptr)) : "ri" (inc) \
- : "memory", "cc"); \
- break; \
- case __X86_CASE_L: \
- asm volatile (lock "addl %1, %0\n" \
- : "+m" (*(ptr)) : "ri" (inc) \
- : "memory", "cc"); \
- break; \
- case __X86_CASE_Q: \
- asm volatile (lock "addq %1, %0\n" \
- : "+m" (*(ptr)) : "ri" (inc) \
- : "memory", "cc"); \
- break; \
- default: \
- __add_wrong_size(); \
- } \
- __ret; \
- })
-
-/*
- * add_*() adds "inc" to "*ptr"
- *
- * __add() takes a lock prefix
- * add_smp() is locked when multiple CPUs are online
- * add_sync() is always locked
- */
-#define add_smp(ptr, inc) __add((ptr), (inc), LOCK_PREFIX)
-#define add_sync(ptr, inc) __add((ptr), (inc), "lock; ")
#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \
({ \
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index a18806165fe4..03d269bed941 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -275,10 +275,10 @@ struct compat_shmid64_ds {
#ifdef CONFIG_X86_X32_ABI
typedef struct user_regs_struct compat_elf_gregset_t;
-#define PR_REG_SIZE(S) (test_thread_flag(TIF_IA32) ? 68 : 216)
-#define PRSTATUS_SIZE(S) (test_thread_flag(TIF_IA32) ? 144 : 296)
-#define SET_PR_FPVALID(S,V) \
- do { *(int *) (((void *) &((S)->pr_reg)) + PR_REG_SIZE(0)) = (V); } \
+/* Full regset -- prstatus on x32, otherwise on ia32 */
+#define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296)
+#define SET_PR_FPVALID(S, V, R) \
+ do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \
while (0)
#define COMPAT_USE_64BIT_TIME \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 92a8308b96f6..a39629206864 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -106,7 +106,6 @@
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
-#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
@@ -195,6 +194,8 @@
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
+#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
+#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4e10d73cf018..12080d87da3b 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -36,7 +36,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
extern struct desc_ptr idt_descr;
extern gate_desc idt_table[];
-extern struct desc_ptr debug_idt_descr;
+extern const struct desc_ptr debug_idt_descr;
extern gate_desc debug_idt_table[];
struct gdt_page {
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 3ab0537872fb..476b574de99e 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -10,8 +10,8 @@
#include <uapi/asm/e820.h>
#ifndef __ASSEMBLY__
/* see comment in arch/x86/kernel/e820.c */
-extern struct e820map e820;
-extern struct e820map e820_saved;
+extern struct e820map *e820;
+extern struct e820map *e820_saved;
extern unsigned long pci_mem_start;
extern int e820_any_mapped(u64 start, u64 end, unsigned type);
@@ -53,6 +53,8 @@ extern void e820_reserve_resources_late(void);
extern void setup_memory_map(void);
extern char *default_machine_specific_memory_setup(void);
+extern void e820_reallocate_tables(void);
+
/*
* Returns true iff the specified range [s,e) is completely contained inside
* the ISA region.
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index d0bb76d81402..389d700b961e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -117,7 +117,6 @@ extern int __init efi_memblock_x86_reserve_range(void);
extern pgd_t * __init efi_call_phys_prolog(void);
extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
extern void __init efi_print_memmap(void);
-extern void __init efi_unmap_memmap(void);
extern void __init efi_memory_uc(u64 addr, unsigned long size);
extern void __init efi_map_region(efi_memory_desc_t *md);
extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
@@ -192,14 +191,7 @@ static inline efi_status_t efi_thunk_set_virtual_address_map(
struct efi_config {
u64 image_handle;
u64 table;
- u64 allocate_pool;
- u64 allocate_pages;
- u64 get_memory_map;
- u64 free_pool;
- u64 free_pages;
- u64 locate_handle;
- u64 handle_protocol;
- u64 exit_boot_services;
+ u64 boot_services;
u64 text_output;
efi_status_t (*call)(unsigned long, ...);
bool is64;
@@ -207,14 +199,27 @@ struct efi_config {
__pure const struct efi_config *__efi_early(void);
+static inline bool efi_is_64bit(void)
+{
+ if (!IS_ENABLED(CONFIG_X86_64))
+ return false;
+
+ if (!IS_ENABLED(CONFIG_EFI_MIXED))
+ return true;
+
+ return __efi_early()->is64;
+}
+
#define efi_call_early(f, ...) \
- __efi_early()->call(__efi_early()->f, __VA_ARGS__);
+ __efi_early()->call(efi_is_64bit() ? \
+ ((efi_boot_services_64_t *)(unsigned long) \
+ __efi_early()->boot_services)->f : \
+ ((efi_boot_services_32_t *)(unsigned long) \
+ __efi_early()->boot_services)->f, __VA_ARGS__)
#define __efi_call_early(f, ...) \
__efi_early()->call((unsigned long)f, __VA_ARGS__);
-#define efi_is_64bit() __efi_early()->is64
-
extern bool efi_reboot_required(void);
#else
diff --git a/arch/x86/include/asm/export.h b/arch/x86/include/asm/export.h
new file mode 100644
index 000000000000..138de56b13eb
--- /dev/null
+++ b/arch/x86/include/asm/export.h
@@ -0,0 +1,4 @@
+#ifdef CONFIG_64BIT
+#define KSYM_ALIGN 16
+#endif
+#include <asm-generic/export.h>
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
new file mode 100644
index 000000000000..b8ad261d11dc
--- /dev/null
+++ b/arch/x86/include/asm/extable.h
@@ -0,0 +1,35 @@
+#ifndef _ASM_X86_EXTABLE_H
+#define _ASM_X86_EXTABLE_H
+/*
+ * The exception table consists of triples of addresses relative to the
+ * exception table entry itself. The first address is of an instruction
+ * that is allowed to fault, the second is the target at which the program
+ * should continue. The third is a handler function to deal with the fault
+ * caused by the instruction in the first field.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path. This means when everything is well,
+ * we don't even have to jump over them. Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry {
+ int insn, fixup, handler;
+};
+struct pt_regs;
+
+#define ARCH_HAS_RELATIVE_EXTABLE
+
+#define swap_ex_entry_fixup(a, b, tmp, delta) \
+ do { \
+ (a)->fixup = (b)->fixup + (delta); \
+ (b)->fixup = (tmp).fixup - (delta); \
+ (a)->handler = (b)->handler + (delta); \
+ (b)->handler = (tmp).handler - (delta); \
+ } while (0)
+
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern bool ex_has_fault_handler(unsigned long ip);
+extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
+
+#endif
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
index 0e970d00dfcd..20a1fbf7fe4e 100644
--- a/arch/x86/include/asm/fpu/signal.h
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -19,6 +19,12 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
# define ia32_setup_rt_frame __setup_rt_frame
#endif
+#ifdef CONFIG_COMPAT
+int __copy_siginfo_to_user32(compat_siginfo_t __user *to,
+ const siginfo_t *from, bool x32_ABI);
+#endif
+
+
extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
struct task_struct *tsk);
extern void convert_to_fxsr(struct task_struct *tsk,
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index ae55a43e09c0..430bacf73074 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -27,11 +27,12 @@
XFEATURE_MASK_YMM | \
XFEATURE_MASK_OPMASK | \
XFEATURE_MASK_ZMM_Hi256 | \
- XFEATURE_MASK_Hi16_ZMM | \
- XFEATURE_MASK_PKRU)
+ XFEATURE_MASK_Hi16_ZMM)
/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
+#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
+ XFEATURE_MASK_PKRU)
/* All currently supported features */
#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
@@ -45,7 +46,8 @@
extern u64 xfeatures_mask;
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
-extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
+extern void __init update_regset_xstate_info(unsigned int size,
+ u64 xstate_mask);
void fpu__xstate_clear_all_cpu_caps(void);
void *get_xsave_addr(struct xregs_state *xsave, int xstate);
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index a4820d4df617..eccd0ac6bc38 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -6,6 +6,7 @@
# define MCOUNT_ADDR ((unsigned long)(__fentry__))
#else
# define MCOUNT_ADDR ((unsigned long)(mcount))
+# define HAVE_FUNCTION_GRAPH_FP_TEST
#endif
#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
@@ -13,6 +14,8 @@
#define ARCH_SUPPORTS_FTRACE_OPS 1
#endif
+#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+
#ifndef __ASSEMBLY__
extern void mcount(void);
extern atomic_t modifying_ftrace_code;
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index 055ea9941dd5..67942b6ad4b7 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -43,6 +43,9 @@ struct hypervisor_x86 {
/* X2APIC detection (run once per boot) */
bool (*x2apic_available)(void);
+
+ /* pin current vcpu to specified physical cpu (run rarely) */
+ void (*pin_vcpu)(int);
};
extern const struct hypervisor_x86 *x86_hyper;
@@ -56,6 +59,7 @@ extern const struct hypervisor_x86 x86_hyper_kvm;
extern void init_hypervisor(struct cpuinfo_x86 *c);
extern void init_hypervisor_platform(void);
extern bool hypervisor_x2apic_available(void);
+extern void hypervisor_pin_vcpu(int cpu);
#else
static inline void init_hypervisor(struct cpuinfo_x86 *c) { }
static inline void init_hypervisor_platform(void) { }
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 627719475457..34a46dc076d3 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -56,13 +56,14 @@
#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */
#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */
#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */
-#define INTEL_FAM6_ATOM_MERRIFIELD1 0x4A /* Tangier */
-#define INTEL_FAM6_ATOM_MERRIFIELD2 0x5A /* Annidale */
+#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */
+#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Annidale */
#define INTEL_FAM6_ATOM_GOLDMONT 0x5C
#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */
/* Xeon Phi */
#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */
+#define INTEL_FAM6_XEON_PHI_KNM 0x85 /* Knights Mill */
#endif /* _ASM_X86_INTEL_FAMILY_H */
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index 9d6b097aa73d..5b6753d1f7f4 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -18,6 +18,8 @@
extern int intel_mid_pci_init(void);
extern int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state);
+extern void intel_mid_pwr_power_off(void);
+
#define INTEL_MID_PWR_LSS_OFFSET 4
#define INTEL_MID_PWR_LSS_TYPE (1 << 7)
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
index 925b605eb5c6..4fb1d0abef95 100644
--- a/arch/x86/include/asm/intel_scu_ipc.h
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -3,6 +3,8 @@
#include <linux/notifier.h>
+#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */
+
#define IPCMSG_WARM_RESET 0xF0
#define IPCMSG_COLD_RESET 0xF1
#define IPCMSG_SOFT_RESET 0xF2
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index e7de5c9a4fbd..16d3fa211962 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -50,8 +50,9 @@ extern int vector_used_by_percpu_irq(unsigned int vector);
extern void init_ISA_irqs(void);
#ifdef CONFIG_X86_LOCAL_APIC
-void arch_trigger_all_cpu_backtrace(bool);
-#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
+ bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
#endif
#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index b77f5edb03b0..ac7692dcfa2e 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -4,6 +4,10 @@
#include <asm/processor-flags.h>
#ifndef __ASSEMBLY__
+
+/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
+#define __cpuidle __attribute__((__section__(".cpuidle.text")))
+
/*
* Interrupt control:
*/
@@ -44,12 +48,12 @@ static inline void native_irq_enable(void)
asm volatile("sti": : :"memory");
}
-static inline void native_safe_halt(void)
+static inline __cpuidle void native_safe_halt(void)
{
asm volatile("sti; hlt": : :"memory");
}
-static inline void native_halt(void)
+static inline __cpuidle void native_halt(void)
{
asm volatile("hlt": : :"memory");
}
@@ -86,7 +90,7 @@ static inline notrace void arch_local_irq_enable(void)
* Used in the idle loop; sti takes one instruction cycle
* to complete:
*/
-static inline void arch_safe_halt(void)
+static inline __cpuidle void arch_safe_halt(void)
{
native_safe_halt();
}
@@ -95,7 +99,7 @@ static inline void arch_safe_halt(void)
* Used when interrupts are already enabled or to
* shutdown the processor:
*/
-static inline void halt(void)
+static inline __cpuidle void halt(void)
{
native_halt();
}
diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h
index 2674ee3de748..1052a797d71d 100644
--- a/arch/x86/include/asm/kaslr.h
+++ b/arch/x86/include/asm/kaslr.h
@@ -6,6 +6,7 @@ unsigned long kaslr_get_random_long(const char *purpose);
#ifdef CONFIG_RANDOMIZE_MEMORY
extern unsigned long page_offset_base;
extern unsigned long vmalloc_base;
+extern unsigned long vmemmap_base;
void kernel_randomize_memory(void);
#else
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 1ef9d581b5d9..d31881188431 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -24,8 +24,6 @@ enum die_val {
extern void printk_address(unsigned long address);
extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long);
-extern void show_trace(struct task_struct *t, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp);
extern void show_stack_regs(struct pt_regs *regs);
extern void __show_regs(struct pt_regs *regs, int all);
extern unsigned long oops_begin(void);
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index d2434c1cad05..282630e4c6ea 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -210,6 +210,7 @@ struct kexec_entry64_regs {
typedef void crash_vmclear_fn(void);
extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+extern void kdump_nmi_shootdown_cpus(void);
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 33ae3a4d0159..4b20f7304b9c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -568,6 +568,7 @@ struct kvm_vcpu_arch {
struct kvm_steal_time steal;
} st;
+ u64 tsc_offset;
u64 last_guest_tsc;
u64 last_host_tsc;
u64 tsc_offset_adjustment;
@@ -701,6 +702,8 @@ struct kvm_hv {
/* Hyper-v based guest crash (NT kernel bugcheck) parameters */
u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
u64 hv_crash_ctl;
+
+ HV_REFERENCE_TSC_PAGE tsc_ref;
};
struct kvm_arch {
@@ -781,54 +784,56 @@ struct kvm_arch {
bool disabled_lapic_found;
/* Struct members for AVIC */
+ u32 avic_vm_id;
u32 ldr_mode;
struct page *avic_logical_id_table_page;
struct page *avic_physical_id_table_page;
+ struct hlist_node hnode;
bool x2apic_format;
bool x2apic_broadcast_quirk_disabled;
};
struct kvm_vm_stat {
- u32 mmu_shadow_zapped;
- u32 mmu_pte_write;
- u32 mmu_pte_updated;
- u32 mmu_pde_zapped;
- u32 mmu_flooded;
- u32 mmu_recycled;
- u32 mmu_cache_miss;
- u32 mmu_unsync;
- u32 remote_tlb_flush;
- u32 lpages;
+ ulong mmu_shadow_zapped;
+ ulong mmu_pte_write;
+ ulong mmu_pte_updated;
+ ulong mmu_pde_zapped;
+ ulong mmu_flooded;
+ ulong mmu_recycled;
+ ulong mmu_cache_miss;
+ ulong mmu_unsync;
+ ulong remote_tlb_flush;
+ ulong lpages;
};
struct kvm_vcpu_stat {
- u32 pf_fixed;
- u32 pf_guest;
- u32 tlb_flush;
- u32 invlpg;
-
- u32 exits;
- u32 io_exits;
- u32 mmio_exits;
- u32 signal_exits;
- u32 irq_window_exits;
- u32 nmi_window_exits;
- u32 halt_exits;
- u32 halt_successful_poll;
- u32 halt_attempted_poll;
- u32 halt_poll_invalid;
- u32 halt_wakeup;
- u32 request_irq_exits;
- u32 irq_exits;
- u32 host_state_reload;
- u32 efer_reload;
- u32 fpu_reload;
- u32 insn_emulation;
- u32 insn_emulation_fail;
- u32 hypercalls;
- u32 irq_injections;
- u32 nmi_injections;
+ u64 pf_fixed;
+ u64 pf_guest;
+ u64 tlb_flush;
+ u64 invlpg;
+
+ u64 exits;
+ u64 io_exits;
+ u64 mmio_exits;
+ u64 signal_exits;
+ u64 irq_window_exits;
+ u64 nmi_window_exits;
+ u64 halt_exits;
+ u64 halt_successful_poll;
+ u64 halt_attempted_poll;
+ u64 halt_poll_invalid;
+ u64 halt_wakeup;
+ u64 request_irq_exits;
+ u64 irq_exits;
+ u64 host_state_reload;
+ u64 efer_reload;
+ u64 fpu_reload;
+ u64 insn_emulation;
+ u64 insn_emulation_fail;
+ u64 hypercalls;
+ u64 irq_injections;
+ u64 nmi_injections;
};
struct x86_instruction_info;
@@ -951,7 +956,6 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
- u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 8bf766ef0e18..9bd7ff5ffbcc 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -40,9 +40,10 @@
#define MCI_STATUS_AR (1ULL<<55) /* Action required */
/* AMD-specific bits */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */
#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
-#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
/*
* McaX field if set indicates a given bank supports MCA extensions:
@@ -110,6 +111,7 @@
#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003
#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
+#define MSR_AMD64_SMCA_MC0_SYND 0xc0002006
#define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008
#define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009
#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
@@ -119,6 +121,7 @@
#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_SYND(x) (MSR_AMD64_SMCA_MC0_SYND + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))
@@ -334,44 +337,47 @@ extern void apei_mce_report_mem_error(int corrected,
* Scalable MCA.
*/
#ifdef CONFIG_X86_MCE_AMD
-enum amd_ip_types {
- SMCA_F17H_CORE = 0, /* Core errors */
- SMCA_DF, /* Data Fabric */
- SMCA_UMC, /* Unified Memory Controller */
- SMCA_PB, /* Parameter Block */
- SMCA_PSP, /* Platform Security Processor */
- SMCA_SMU, /* System Management Unit */
- N_AMD_IP_TYPES
-};
-
-struct amd_hwid {
- const char *name;
- unsigned int hwid;
-};
-
-extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];
-enum amd_core_mca_blocks {
+/* These may be used by multiple smca_hwid_mcatypes */
+enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
SMCA_IF, /* Instruction Fetch */
- SMCA_L2_CACHE, /* L2 cache */
- SMCA_DE, /* Decoder unit */
- RES, /* Reserved */
- SMCA_EX, /* Execution unit */
+ SMCA_L2_CACHE, /* L2 Cache */
+ SMCA_DE, /* Decoder Unit */
+ SMCA_EX, /* Execution Unit */
SMCA_FP, /* Floating Point */
- SMCA_L3_CACHE, /* L3 cache */
- N_CORE_MCA_BLOCKS
+ SMCA_L3_CACHE, /* L3 Cache */
+ SMCA_CS, /* Coherent Slave */
+ SMCA_PIE, /* Power, Interrupts, etc. */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_SMU, /* System Management Unit */
+ N_SMCA_BANK_TYPES
};
-extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];
+struct smca_bank_name {
+ const char *name; /* Short name for sysfs */
+ const char *long_name; /* Long name for pretty-printing */
+};
+
+extern struct smca_bank_name smca_bank_names[N_SMCA_BANK_TYPES];
+
+#define HWID_MCATYPE(hwid, mcatype) ((hwid << 16) | mcatype)
-enum amd_df_mca_blocks {
- SMCA_CS = 0, /* Coherent Slave */
- SMCA_PIE, /* Power management, Interrupts, etc */
- N_DF_BLOCKS
+struct smca_hwid_mcatype {
+ unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
+ u32 hwid_mcatype; /* (hwid,mcatype) tuple */
+ u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */
};
-extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
+struct smca_bank_info {
+ struct smca_hwid_mcatype *type;
+ u32 type_instance;
+};
+
+extern struct smca_bank_info smca_banks[MAX_NR_BANKS];
+
#endif
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 1ea0baef1175..72198c64e646 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -23,6 +23,14 @@ typedef struct {
const struct vdso_image *vdso_image; /* vdso image in use */
atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ /*
+ * One bit per protection key says whether userspace can
+ * use it or not. protected by mmap_sem.
+ */
+ u16 pkey_allocation_map;
+ s16 execute_only_pkey;
+#endif
} mm_context_t;
#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index d8abfcf524d1..8e0a9fe86de4 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -4,6 +4,7 @@
#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
+#include <linux/pkeys.h>
#include <trace/events/tlb.h>
@@ -107,7 +108,16 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
+ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
+ /* pkey 0 is the default and always allocated */
+ mm->context.pkey_allocation_map = 0x1;
+ /* -1 means unallocated or invalid */
+ mm->context.execute_only_pkey = -1;
+ }
+ #endif
init_new_context_ldt(tsk, mm);
+
return 0;
}
static inline void destroy_context(struct mm_struct *mm)
@@ -195,16 +205,20 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
mpx_notify_unmap(mm, vma, start, end);
}
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
static inline int vma_pkey(struct vm_area_struct *vma)
{
- u16 pkey = 0;
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
VM_PKEY_BIT2 | VM_PKEY_BIT3;
- pkey = (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
-#endif
- return pkey;
+
+ return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
+}
+#else
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+ return 0;
}
+#endif
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
@@ -258,5 +272,4 @@ static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
}
-
#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index b07233b64578..32007041ef8c 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -6,7 +6,6 @@
#include <asm/x86_init.h>
#include <asm/apicdef.h>
-extern int apic_version[];
extern int pic_mode;
#ifdef CONFIG_X86_32
@@ -40,6 +39,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
extern unsigned int boot_cpu_physical_apicid;
+extern u8 boot_cpu_apic_version;
extern unsigned long mp_lapic_addr;
#ifdef CONFIG_X86_LOCAL_APIC
@@ -86,6 +86,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
#endif
int generic_processor_info(int apicid, int version);
+int __generic_processor_info(int apicid, int version, bool enabled);
#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 56f4c6676b29..78f3760ca1f2 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -88,7 +88,6 @@
#define MSR_IA32_RTIT_CTL 0x00000570
#define MSR_IA32_RTIT_STATUS 0x00000571
-#define MSR_IA32_RTIT_STATUS 0x00000571
#define MSR_IA32_RTIT_ADDR0_A 0x00000580
#define MSR_IA32_RTIT_ADDR0_B 0x00000581
#define MSR_IA32_RTIT_ADDR1_A 0x00000582
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 2970d22d7766..ce932812f142 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -80,10 +80,6 @@ static inline unsigned long __read_cr4(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
-static inline unsigned long __read_cr4_safe(void)
-{
- return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
-}
static inline void __write_cr4(unsigned long x)
{
@@ -661,8 +657,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
-#ifdef CONFIG_QUEUED_SPINLOCKS
-
static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
u32 val)
{
@@ -684,22 +678,6 @@ static __always_inline void pv_kick(int cpu)
PVOP_VCALL1(pv_lock_ops.kick, cpu);
}
-#else /* !CONFIG_QUEUED_SPINLOCKS */
-
-static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
- __ticket_t ticket)
-{
- PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket);
-}
-
-static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
- __ticket_t ticket)
-{
- PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
-}
-
-#endif /* CONFIG_QUEUED_SPINLOCKS */
-
#endif /* SMP && PARAVIRT_SPINLOCKS */
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7fa9e7740ba3..0f400c0e4979 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -108,7 +108,6 @@ struct pv_cpu_ops {
unsigned long (*read_cr0)(void);
void (*write_cr0)(unsigned long);
- unsigned long (*read_cr4_safe)(void);
unsigned long (*read_cr4)(void);
void (*write_cr4)(unsigned long);
@@ -301,23 +300,16 @@ struct pv_mmu_ops {
struct arch_spinlock;
#ifdef CONFIG_SMP
#include <asm/spinlock_types.h>
-#else
-typedef u16 __ticket_t;
#endif
struct qspinlock;
struct pv_lock_ops {
-#ifdef CONFIG_QUEUED_SPINLOCKS
void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
struct paravirt_callee_save queued_spin_unlock;
void (*wait)(u8 *ptr, u8 val);
void (*kick)(int cpu);
-#else /* !CONFIG_QUEUED_SPINLOCKS */
- struct paravirt_callee_save lock_spinning;
- void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
-#endif /* !CONFIG_QUEUED_SPINLOCKS */
};
/* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 9ab7507ca1c2..1411dbed5e5e 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -23,6 +23,9 @@ struct pci_sysdata {
#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
void *fwnode; /* IRQ domain for MSI assignment */
#endif
+#if IS_ENABLED(CONFIG_VMD)
+ bool vmd_domain; /* True if in Intel VMD domain */
+#endif
};
extern int pci_routeirq;
@@ -56,6 +59,17 @@ static inline void *_pci_root_bus_fwnode(struct pci_bus *bus)
#define pci_root_bus_fwnode _pci_root_bus_fwnode
#endif
+static inline bool is_vmd(struct pci_bus *bus)
+{
+#if IS_ENABLED(CONFIG_VMD)
+ struct pci_sysdata *sd = bus->sysdata;
+
+ return sd->vmd_domain;
+#else
+ return false;
+#endif
+}
+
/* Can be used to override the logic in pci_scan_bus for skipping
already-configured bus numbers - to be used for buggy BIOSes
or architectures with incomplete PCI setup by the loader */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index e02e3f80d363..84f58de08c2b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -521,7 +521,8 @@ do { \
static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
const unsigned long __percpu *addr)
{
- unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
+ unsigned long __percpu *a =
+ (unsigned long __percpu *)addr + nr / BITS_PER_LONG;
#ifdef CONFIG_X86_64
return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
@@ -538,7 +539,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
asm volatile("bt "__percpu_arg(2)",%1\n\t"
CC_SET(c)
: CC_OUT(c) (oldbit)
- : "m" (*(unsigned long *)addr), "Ir" (nr));
+ : "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
return oldbit;
}
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 6fdef9eef2d5..3a264200c62f 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -57,11 +57,13 @@ typedef struct { pteval_t pte; } pte_t;
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
#define VMALLOC_SIZE_TB _AC(32, UL)
#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
-#define VMEMMAP_START _AC(0xffffea0000000000, UL)
+#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
#ifdef CONFIG_RANDOMIZE_MEMORY
#define VMALLOC_START vmalloc_base
+#define VMEMMAP_START vmemmap_base
#else
#define VMALLOC_START __VMALLOC_BASE
+#define VMEMMAP_START __VMEMMAP_BASE
#endif /* CONFIG_RANDOMIZE_MEMORY */
#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f1218f512f62..8b4de22d6429 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -439,8 +439,6 @@ extern pgprot_t pgprot_writethrough(pgprot_t prot);
struct file;
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot);
-int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
- unsigned long size, pgprot_t *vma_prot);
/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 7b84565c916c..34684adb6899 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -10,7 +10,6 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
* Try to dedicate one of the protection keys to be used as an
* execute-only protection key.
*/
-#define PKEY_DEDICATED_EXECUTE_ONLY 15
extern int __execute_only_pkey(struct mm_struct *mm);
static inline int execute_only_pkey(struct mm_struct *mm)
{
@@ -31,4 +30,76 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
return __arch_override_mprotect_pkey(vma, prot, pkey);
}
+extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val);
+
+#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3)
+
+#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map)
+#define mm_set_pkey_allocated(mm, pkey) do { \
+ mm_pkey_allocation_map(mm) |= (1U << pkey); \
+} while (0)
+#define mm_set_pkey_free(mm, pkey) do { \
+ mm_pkey_allocation_map(mm) &= ~(1U << pkey); \
+} while (0)
+
+static inline
+bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
+{
+ return mm_pkey_allocation_map(mm) & (1U << pkey);
+}
+
+/*
+ * Returns a positive, 4-bit key on success, or -1 on failure.
+ */
+static inline
+int mm_pkey_alloc(struct mm_struct *mm)
+{
+ /*
+ * Note: this is the one and only place we make sure
+ * that the pkey is valid as far as the hardware is
+ * concerned. The rest of the kernel trusts that
+ * only good, valid pkeys come out of here.
+ */
+ u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1);
+ int ret;
+
+ /*
+ * Are we out of pkeys? We must handle this specially
+ * because ffz() behavior is undefined if there are no
+ * zeros.
+ */
+ if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
+ return -1;
+
+ ret = ffz(mm_pkey_allocation_map(mm));
+
+ mm_set_pkey_allocated(mm, ret);
+
+ return ret;
+}
+
+static inline
+int mm_pkey_free(struct mm_struct *mm, int pkey)
+{
+ /*
+ * pkey 0 is special, always allocated and can never
+ * be freed.
+ */
+ if (!pkey)
+ return -EINVAL;
+ if (!mm_pkey_is_allocated(mm, pkey))
+ return -EINVAL;
+
+ mm_set_pkey_free(mm, pkey);
+
+ return 0;
+}
+
+extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val);
+extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val);
+extern void copy_init_pkru_to_fpregs(void);
+
#endif /*_ASM_X86_PKEYS_H */
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index 643eba42d620..2c1ebeb4d737 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -46,10 +46,7 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n)
static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n)
{
- if (static_cpu_has(X86_FEATURE_MCE_RECOVERY))
- return memcpy_mcsafe(dst, src, n);
- memcpy(dst, src, n);
- return 0;
+ return memcpy_mcsafe(dst, src, n);
}
/**
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 63def9537a2d..984a7bf17f6a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -389,9 +389,9 @@ struct thread_struct {
unsigned short fsindex;
unsigned short gsindex;
#endif
-#ifdef CONFIG_X86_32
- unsigned long ip;
-#endif
+
+ u32 status; /* thread synchronous flags */
+
#ifdef CONFIG_X86_64
unsigned long fsbase;
unsigned long gsbase;
@@ -438,6 +438,15 @@ struct thread_struct {
};
/*
+ * Thread-synchronous status.
+ *
+ * This is different from the flags in that nobody else
+ * ever touches our thread-synchronous status, so we don't
+ * have to worry about atomic accesses.
+ */
+#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
+
+/*
* Set IOPL bits in EFLAGS from given mask
*/
static inline void native_set_iopl_mask(unsigned mask)
@@ -724,8 +733,6 @@ static inline void spin_lock_prefetch(const void *x)
.addr_limit = KERNEL_DS, \
}
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-
/*
* TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
* This is necessary to guarantee that the entire "struct pt_regs"
@@ -776,17 +783,13 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
.addr_limit = KERNEL_DS, \
}
-/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
- */
-#define thread_saved_pc(t) READ_ONCE_NOCHECK(*(unsigned long *)((t)->thread.sp - 8))
-
#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
extern unsigned long KSTK_ESP(struct task_struct *task);
#endif /* CONFIG_X86_64 */
+extern unsigned long thread_saved_pc(struct task_struct *tsk);
+
extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
unsigned long new_sp);
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d019f0cc80ec..3ad741b84072 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
}
static __always_inline
-cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
+cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+ u64 tsc)
{
- u64 delta = rdtsc_ordered() - src->tsc_timestamp;
+ u64 delta = tsc - src->tsc_timestamp;
cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
src->tsc_shift);
return src->system_time + offset;
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index b2988c0ed829..230e1903acf0 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -44,9 +44,9 @@ struct trampoline_header {
extern struct real_mode_header *real_mode_header;
extern unsigned char real_mode_blob_end[];
-extern unsigned long init_rsp;
extern unsigned long initial_code;
extern unsigned long initial_gs;
+extern unsigned long initial_stack;
extern unsigned char real_mode_blob[];
extern unsigned char real_mode_relocs[];
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index 8dbc762ad132..a34e0d4b957d 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -103,8 +103,10 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem)
({ \
long tmp; \
struct rw_semaphore* ret; \
+ register void *__sp asm(_ASM_SP); \
+ \
asm volatile("# beginning down_write\n\t" \
- LOCK_PREFIX " xadd %1,(%3)\n\t" \
+ LOCK_PREFIX " xadd %1,(%4)\n\t" \
/* adds 0xffff0001, returns the old value */ \
" test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \
/* was the active mask 0 before? */\
@@ -112,7 +114,7 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem)
" call " slow_path "\n" \
"1:\n" \
"# ending down_write" \
- : "+m" (sem->count), "=d" (tmp), "=a" (ret) \
+ : "+m" (sem->count), "=d" (tmp), "=a" (ret), "+r" (__sp) \
: "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \
: "memory", "cc"); \
ret; \
@@ -154,7 +156,7 @@ static inline bool __down_write_trylock(struct rw_semaphore *sem)
: "+m" (sem->count), "=&a" (tmp0), "=&r" (tmp1),
CC_OUT(e) (result)
: "er" (RWSEM_ACTIVE_WRITE_BIAS)
- : "memory", "cc");
+ : "memory");
return result;
}
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 13b6cdd0af57..2f75f30cb2f6 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,7 +2,7 @@
#define _ASM_X86_SECTIONS_H
#include <asm-generic/sections.h>
-#include <asm/uaccess.h>
+#include <asm/extable.h>
extern char __brk_base[], __brk_limit[];
extern struct exception_table_entry __stop___ex_table[];
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index dd1e7d6387ab..8af22be0fe61 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -23,6 +23,10 @@ typedef struct {
unsigned long sig[_NSIG_WORDS];
} sigset_t;
+/* non-uapi in-kernel SA_FLAGS for those indicates ABI for a signal frame */
+#define SA_IA32_ABI 0x02000000u
+#define SA_X32_ABI 0x01000000u
+
#ifndef CONFIG_COMPAT
typedef sigset_t compat_sigset_t;
#endif
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index ebd0c164cd4e..026ea82ecc60 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -39,9 +39,6 @@ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
#endif
-/* Static state in head.S used to set up a CPU */
-extern unsigned long stack_start; /* Initial stack pointer address */
-
struct task_struct;
struct smp_ops {
@@ -50,6 +47,7 @@ struct smp_ops {
void (*smp_cpus_done)(unsigned max_cpus);
void (*stop_other_cpus)(int wait);
+ void (*crash_stop_other_cpus)(void);
void (*smp_send_reschedule)(int cpu);
int (*cpu_up)(unsigned cpu, struct task_struct *tidle);
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 587d7914ea4b..19a2224f9e16 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -59,22 +59,19 @@ static inline void native_write_cr3(unsigned long val)
static inline unsigned long native_read_cr4(void)
{
unsigned long val;
- asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
- return val;
-}
-
-static inline unsigned long native_read_cr4_safe(void)
-{
- unsigned long val;
- /* This could fault if %cr4 does not exist. In x86_64, a cr4 always
- * exists, so it will never fail. */
#ifdef CONFIG_X86_32
+ /*
+ * This could fault if CR4 does not exist. Non-existent CR4
+ * is functionally equivalent to CR4 == 0. Keep it simple and pretend
+ * that CR4 == 0 on CPUs that don't have CR4.
+ */
asm volatile("1: mov %%cr4, %0\n"
"2:\n"
_ASM_EXTABLE(1b, 2b)
: "=r" (val), "=m" (__force_order) : "0" (0));
#else
- val = native_read_cr4();
+ /* CR4 always exists on x86_64. */
+ asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
#endif
return val;
}
@@ -182,11 +179,6 @@ static inline unsigned long __read_cr4(void)
return native_read_cr4();
}
-static inline unsigned long __read_cr4_safe(void)
-{
- return native_read_cr4_safe();
-}
-
static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index be0a05913b91..921bea7a2708 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -20,187 +20,13 @@
* (the type definitions are in asm/spinlock_types.h)
*/
-#ifdef CONFIG_X86_32
-# define LOCK_PTR_REG "a"
-#else
-# define LOCK_PTR_REG "D"
-#endif
-
-#if defined(CONFIG_X86_32) && (defined(CONFIG_X86_PPRO_FENCE))
-/*
- * On PPro SMP, we use a locked operation to unlock
- * (PPro errata 66, 92)
- */
-# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
-#else
-# define UNLOCK_LOCK_PREFIX
-#endif
-
/* How long a lock should spin before we consider blocking */
#define SPIN_THRESHOLD (1 << 15)
extern struct static_key paravirt_ticketlocks_enabled;
static __always_inline bool static_key_false(struct static_key *key);
-#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm/qspinlock.h>
-#else
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-
-static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
-{
- set_bit(0, (volatile unsigned long *)&lock->tickets.head);
-}
-
-#else /* !CONFIG_PARAVIRT_SPINLOCKS */
-static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock,
- __ticket_t ticket)
-{
-}
-static inline void __ticket_unlock_kick(arch_spinlock_t *lock,
- __ticket_t ticket)
-{
-}
-
-#endif /* CONFIG_PARAVIRT_SPINLOCKS */
-static inline int __tickets_equal(__ticket_t one, __ticket_t two)
-{
- return !((one ^ two) & ~TICKET_SLOWPATH_FLAG);
-}
-
-static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock,
- __ticket_t head)
-{
- if (head & TICKET_SLOWPATH_FLAG) {
- arch_spinlock_t old, new;
-
- old.tickets.head = head;
- new.tickets.head = head & ~TICKET_SLOWPATH_FLAG;
- old.tickets.tail = new.tickets.head + TICKET_LOCK_INC;
- new.tickets.tail = old.tickets.tail;
-
- /* try to clear slowpath flag when there are no contenders */
- cmpxchg(&lock->head_tail, old.head_tail, new.head_tail);
- }
-}
-
-static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
-{
- return __tickets_equal(lock.tickets.head, lock.tickets.tail);
-}
-
-/*
- * Ticket locks are conceptually two parts, one indicating the current head of
- * the queue, and the other indicating the current tail. The lock is acquired
- * by atomically noting the tail and incrementing it by one (thus adding
- * ourself to the queue and noting our position), then waiting until the head
- * becomes equal to the the initial value of the tail.
- *
- * We use an xadd covering *both* parts of the lock, to increment the tail and
- * also load the position of the head, which takes care of memory ordering
- * issues and should be optimal for the uncontended case. Note the tail must be
- * in the high part, because a wide xadd increment of the low part would carry
- * up and contaminate the high part.
- */
-static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
-{
- register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC };
-
- inc = xadd(&lock->tickets, inc);
- if (likely(inc.head == inc.tail))
- goto out;
-
- for (;;) {
- unsigned count = SPIN_THRESHOLD;
-
- do {
- inc.head = READ_ONCE(lock->tickets.head);
- if (__tickets_equal(inc.head, inc.tail))
- goto clear_slowpath;
- cpu_relax();
- } while (--count);
- __ticket_lock_spinning(lock, inc.tail);
- }
-clear_slowpath:
- __ticket_check_and_clear_slowpath(lock, inc.head);
-out:
- barrier(); /* make sure nothing creeps before the lock is taken */
-}
-
-static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
- arch_spinlock_t old, new;
-
- old.tickets = READ_ONCE(lock->tickets);
- if (!__tickets_equal(old.tickets.head, old.tickets.tail))
- return 0;
-
- new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
- new.head_tail &= ~TICKET_SLOWPATH_FLAG;
-
- /* cmpxchg is a full barrier, so nothing can move before it */
- return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
-}
-
-static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
- if (TICKET_SLOWPATH_FLAG &&
- static_key_false(&paravirt_ticketlocks_enabled)) {
- __ticket_t head;
-
- BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
-
- head = xadd(&lock->tickets.head, TICKET_LOCK_INC);
-
- if (unlikely(head & TICKET_SLOWPATH_FLAG)) {
- head &= ~TICKET_SLOWPATH_FLAG;
- __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC));
- }
- } else
- __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
-}
-
-static inline int arch_spin_is_locked(arch_spinlock_t *lock)
-{
- struct __raw_tickets tmp = READ_ONCE(lock->tickets);
-
- return !__tickets_equal(tmp.tail, tmp.head);
-}
-
-static inline int arch_spin_is_contended(arch_spinlock_t *lock)
-{
- struct __raw_tickets tmp = READ_ONCE(lock->tickets);
-
- tmp.head &= ~TICKET_SLOWPATH_FLAG;
- return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
-}
-#define arch_spin_is_contended arch_spin_is_contended
-
-static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
- unsigned long flags)
-{
- arch_spin_lock(lock);
-}
-
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
- __ticket_t head = READ_ONCE(lock->tickets.head);
-
- for (;;) {
- struct __raw_tickets tmp = READ_ONCE(lock->tickets);
- /*
- * We need to check "unlocked" in a loop, tmp.head == head
- * can be false positive because of overflow.
- */
- if (__tickets_equal(tmp.head, tmp.tail) ||
- !__tickets_equal(tmp.head, head))
- break;
-
- cpu_relax();
- }
-}
-#endif /* CONFIG_QUEUED_SPINLOCKS */
/*
* Read-write spinlocks, allowing multiple readers
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 65c3e37f879a..25311ebb446c 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -23,20 +23,7 @@ typedef u32 __ticketpair_t;
#define TICKET_SHIFT (sizeof(__ticket_t) * 8)
-#ifdef CONFIG_QUEUED_SPINLOCKS
#include <asm-generic/qspinlock_types.h>
-#else
-typedef struct arch_spinlock {
- union {
- __ticketpair_t head_tail;
- struct __raw_tickets {
- __ticket_t head, tail;
- } tickets;
- };
-} arch_spinlock_t;
-
-#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
-#endif /* CONFIG_QUEUED_SPINLOCKS */
#include <asm-generic/qrwlock_types.h>
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 0944218af9e2..37f2e0b377ad 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -8,86 +8,86 @@
#include <linux/uaccess.h>
#include <linux/ptrace.h>
+#include <asm/switch_to.h>
+
+enum stack_type {
+ STACK_TYPE_UNKNOWN,
+ STACK_TYPE_TASK,
+ STACK_TYPE_IRQ,
+ STACK_TYPE_SOFTIRQ,
+ STACK_TYPE_EXCEPTION,
+ STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
+};
-extern int kstack_depth_to_print;
-
-struct thread_info;
-struct stacktrace_ops;
-
-typedef unsigned long (*walk_stack_t)(struct task_struct *task,
- unsigned long *stack,
- unsigned long bp,
- const struct stacktrace_ops *ops,
- void *data,
- unsigned long *end,
- int *graph);
-
-extern unsigned long
-print_context_stack(struct task_struct *task,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph);
-
-extern unsigned long
-print_context_stack_bp(struct task_struct *task,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph);
-
-/* Generic stack tracer with callbacks */
-
-struct stacktrace_ops {
- int (*address)(void *data, unsigned long address, int reliable);
- /* On negative return stop dumping */
- int (*stack)(void *data, char *name);
- walk_stack_t walk_stack;
+struct stack_info {
+ enum stack_type type;
+ unsigned long *begin, *end, *next_sp;
};
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data);
+bool in_task_stack(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info);
+
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask);
+
+void stack_type_str(enum stack_type type, const char **begin,
+ const char **end);
+
+static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
+{
+ void *begin = info->begin;
+ void *end = info->end;
+
+ return (info->type != STACK_TYPE_UNKNOWN &&
+ addr >= begin && addr < end &&
+ addr + len > begin && addr + len <= end);
+}
+
+extern int kstack_depth_to_print;
#ifdef CONFIG_X86_32
#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
#else
#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
#endif
#ifdef CONFIG_FRAME_POINTER
-static inline unsigned long
-stack_frame(struct task_struct *task, struct pt_regs *regs)
+static inline unsigned long *
+get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
{
- unsigned long bp;
-
if (regs)
- return regs->bp;
+ return (unsigned long *)regs->bp;
- if (task == current) {
- /* Grab bp right from our regs */
- get_bp(bp);
- return bp;
- }
+ if (task == current)
+ return __builtin_frame_address(0);
- /* bp is the last reg pushed by switch_to */
- return *(unsigned long *)task->thread.sp;
+ return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp;
}
#else
-static inline unsigned long
-stack_frame(struct task_struct *task, struct pt_regs *regs)
+static inline unsigned long *
+get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
{
- return 0;
+ return NULL;
+}
+#endif /* CONFIG_FRAME_POINTER */
+
+static inline unsigned long *
+get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
+{
+ if (regs)
+ return (unsigned long *)kernel_stack_pointer(regs);
+
+ if (task == current)
+ return __builtin_frame_address(0);
+
+ return (unsigned long *)task->thread.sp;
}
-#endif
-extern void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl);
+void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, char *log_lvl);
-extern void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl);
+void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, char *log_lvl);
extern unsigned int code_bytes;
@@ -106,7 +106,7 @@ static inline unsigned long caller_frame_pointer(void)
{
struct stack_frame *frame;
- get_bp(frame);
+ frame = __builtin_frame_address(0);
#ifdef CONFIG_FRAME_POINTER
frame = frame->next_frame;
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 90dbbd9666d4..a164862d77e3 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -2,6 +2,7 @@
#define _ASM_X86_STRING_64_H
#ifdef __KERNEL__
+#include <linux/jump_label.h>
/* Written 2002 by Andi Kleen */
@@ -78,6 +79,9 @@ int strcmp(const char *cs, const char *ct);
#define memset(s, c, n) __memset(s, c, n)
#endif
+__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt);
+DECLARE_STATIC_KEY_FALSE(mcsafe_key);
+
/**
* memcpy_mcsafe - copy memory with indication if a machine check happened
*
@@ -86,10 +90,23 @@ int strcmp(const char *cs, const char *ct);
* @cnt: number of bytes to copy
*
* Low level memory copy function that catches machine checks
+ * We only call into the "safe" function on systems that can
+ * actually do machine check recovery. Everyone else can just
+ * use memcpy().
*
* Return 0 for success, -EFAULT for fail
*/
-int memcpy_mcsafe(void *dst, const void *src, size_t cnt);
+static __always_inline __must_check int
+memcpy_mcsafe(void *dst, const void *src, size_t cnt)
+{
+#ifdef CONFIG_X86_MCE
+ if (static_branch_unlikely(&mcsafe_key))
+ return memcpy_mcsafe_unrolled(dst, src, cnt);
+ else
+#endif
+ memcpy(dst, src, cnt);
+ return 0;
+}
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 8f321a1b03a1..5cb436acd463 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -2,130 +2,66 @@
#define _ASM_X86_SWITCH_TO_H
struct task_struct; /* one of the stranger aspects of C forward declarations */
+
+struct task_struct *__switch_to_asm(struct task_struct *prev,
+ struct task_struct *next);
+
__visible struct task_struct *__switch_to(struct task_struct *prev,
- struct task_struct *next);
+ struct task_struct *next);
struct tss_struct;
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss);
-#ifdef CONFIG_X86_32
+/* This runs runs on the previous thread's stack. */
+static inline void prepare_switch_to(struct task_struct *prev,
+ struct task_struct *next)
+{
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * If we switch to a stack that has a top-level paging entry
+ * that is not present in the current mm, the resulting #PF will
+ * will be promoted to a double-fault and we'll panic. Probe
+ * the new stack now so that vmalloc_fault can fix up the page
+ * tables if needed. This can only happen if we use a stack
+ * in vmap space.
+ *
+ * We assume that the stack is aligned so that it never spans
+ * more than one top-level paging entry.
+ *
+ * To minimize cache pollution, just follow the stack pointer.
+ */
+ READ_ONCE(*(unsigned char *)next->thread.sp);
+#endif
+}
+
+asmlinkage void ret_from_fork(void);
+
+/* data that is pointed to by thread.sp */
+struct inactive_task_frame {
+#ifdef CONFIG_X86_64
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+#else
+ unsigned long si;
+ unsigned long di;
+#endif
+ unsigned long bx;
+ unsigned long bp;
+ unsigned long ret_addr;
+};
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary \
- "movl %P[task_canary](%[next]), %%ebx\n\t" \
- "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
-#define __switch_canary_oparam \
- , [stack_canary] "=m" (stack_canary.canary)
-#define __switch_canary_iparam \
- , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
+struct fork_frame {
+ struct inactive_task_frame frame;
+ struct pt_regs regs;
+};
-/*
- * Saving eflags is important. It switches not only IOPL between tasks,
- * it also protects other tasks from NT leaking through sysenter etc.
- */
#define switch_to(prev, next, last) \
do { \
- /* \
- * Context-switching clobbers all registers, so we clobber \
- * them explicitly, via unused output variables. \
- * (EAX and EBP is not listed because EBP is saved/restored \
- * explicitly for wchan access and EAX is the return value of \
- * __switch_to()) \
- */ \
- unsigned long ebx, ecx, edx, esi, edi; \
- \
- asm volatile("pushl %%ebp\n\t" /* save EBP */ \
- "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
- "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
- "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
- "pushl %[next_ip]\n\t" /* restore EIP */ \
- __switch_canary \
- "jmp __switch_to\n" /* regparm call */ \
- "1:\t" \
- "popl %%ebp\n\t" /* restore EBP */ \
- \
- /* output parameters */ \
- : [prev_sp] "=m" (prev->thread.sp), \
- [prev_ip] "=m" (prev->thread.ip), \
- "=a" (last), \
- \
- /* clobbered output registers: */ \
- "=b" (ebx), "=c" (ecx), "=d" (edx), \
- "=S" (esi), "=D" (edi) \
- \
- __switch_canary_oparam \
- \
- /* input parameters: */ \
- : [next_sp] "m" (next->thread.sp), \
- [next_ip] "m" (next->thread.ip), \
- \
- /* regparm parameters for __switch_to(): */ \
- [prev] "a" (prev), \
- [next] "d" (next) \
+ prepare_switch_to(prev, next); \
\
- __switch_canary_iparam \
- \
- : /* reloaded segment registers */ \
- "memory"); \
+ ((last) = __switch_to_asm((prev), (next))); \
} while (0)
-#else /* CONFIG_X86_32 */
-
-/* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t"
-
-#define __EXTRA_CLOBBER \
- , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
- "r12", "r13", "r14", "r15", "flags"
-
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary \
- "movq %P[task_canary](%%rsi),%%r8\n\t" \
- "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
-#define __switch_canary_oparam \
- , [gs_canary] "=m" (irq_stack_union.stack_canary)
-#define __switch_canary_iparam \
- , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
-
-/*
- * There is no need to save or restore flags, because flags are always
- * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL
- * has no effect.
- */
-#define switch_to(prev, next, last) \
- asm volatile(SAVE_CONTEXT \
- "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
- "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
- "call __switch_to\n\t" \
- "movq "__percpu_arg([current_task])",%%rsi\n\t" \
- __switch_canary \
- "movq %P[thread_info](%%rsi),%%r8\n\t" \
- "movq %%rax,%%rdi\n\t" \
- "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
- "jnz ret_from_fork\n\t" \
- RESTORE_CONTEXT \
- : "=a" (last) \
- __switch_canary_oparam \
- : [next] "S" (next), [prev] "D" (prev), \
- [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
- [ti_flags] "i" (offsetof(struct thread_info, flags)), \
- [_tif_fork] "i" (_TIF_FORK), \
- [thread_info] "i" (offsetof(struct task_struct, stack)), \
- [current_task] "m" (current_task) \
- __switch_canary_iparam \
- : "memory", "cc" __EXTRA_CLOBBER)
-
-#endif /* CONFIG_X86_32 */
-
#endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 4e23dd15c661..e3c95e8e61c5 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
* TS_COMPAT is set for 32-bit syscall entries and then
* remains set until we return to user mode.
*/
- if (task_thread_info(task)->status & (TS_COMPAT|TS_I386_REGS_POKED))
+ if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
/*
* Sign-extend the value so (int)-EFOO becomes (long)-EFOO
* and will match correctly in comparisons.
@@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
- if (task_thread_info(task)->status & TS_COMPAT)
+ if (task->thread.status & TS_COMPAT)
switch (i) {
case 0:
if (!n--) break;
@@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
const unsigned long *args)
{
# ifdef CONFIG_IA32_EMULATION
- if (task_thread_info(task)->status & TS_COMPAT)
+ if (task->thread.status & TS_COMPAT)
switch (i) {
case 0:
if (!n--) break;
@@ -234,18 +234,8 @@ static inline void syscall_set_arguments(struct task_struct *task,
static inline int syscall_get_arch(void)
{
-#ifdef CONFIG_IA32_EMULATION
- /*
- * TS_COMPAT is set for 32-bit syscall entry and then
- * remains set until we return to user mode.
- *
- * x32 tasks should be considered AUDIT_ARCH_X86_64.
- */
- if (task_thread_info(current)->status & TS_COMPAT)
- return AUDIT_ARCH_I386;
-#endif
- /* Both x32 and x86_64 are considered "64-bit". */
- return AUDIT_ARCH_X86_64;
+ /* x32 tasks should be considered AUDIT_ARCH_X86_64. */
+ return in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8b7c8d8e0852..ad6f5eb07a95 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -53,20 +53,14 @@ struct task_struct;
#include <linux/atomic.h>
struct thread_info {
- struct task_struct *task; /* main task structure */
- __u32 flags; /* low level flags */
- __u32 status; /* thread synchronous flags */
- __u32 cpu; /* current CPU */
+ unsigned long flags; /* low level flags */
};
#define INIT_THREAD_INFO(tsk) \
{ \
- .task = &tsk, \
.flags = 0, \
- .cpu = 0, \
}
-#define init_thread_info (init_thread_union.thread_info)
#define init_stack (init_thread_union.stack)
#else /* !__ASSEMBLY__ */
@@ -95,7 +89,6 @@ struct thread_info {
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
#define TIF_IA32 17 /* IA32 compatibility process */
-#define TIF_FORK 18 /* ret_from_fork */
#define TIF_NOHZ 19 /* in adaptive nohz mode */
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */
@@ -119,7 +112,6 @@ struct thread_info {
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_NOTSC (1 << TIF_NOTSC)
#define _TIF_IA32 (1 << TIF_IA32)
-#define _TIF_FORK (1 << TIF_FORK)
#define _TIF_NOHZ (1 << TIF_NOHZ)
#define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
@@ -160,11 +152,6 @@ struct thread_info {
*/
#ifndef __ASSEMBLY__
-static inline struct thread_info *current_thread_info(void)
-{
- return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
-}
-
static inline unsigned long current_stack_pointer(void)
{
unsigned long sp;
@@ -226,60 +213,19 @@ static inline int arch_within_stack_frames(const void * const stack,
# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
#endif
-/*
- * ASM operand which evaluates to a 'thread_info' address of
- * the current task, if it is known that "reg" is exactly "off"
- * bytes below the top of the stack currently.
- *
- * ( The kernel stack's size is known at build time, it is usually
- * 2 or 4 pages, and the bottom of the kernel stack contains
- * the thread_info structure. So to access the thread_info very
- * quickly from assembly code we can calculate down from the
- * top of the kernel stack to the bottom, using constant,
- * build-time calculations only. )
- *
- * For example, to fetch the current thread_info->flags value into %eax
- * on x86-64 defconfig kernels, in syscall entry code where RSP is
- * currently at exactly SIZEOF_PTREGS bytes away from the top of the
- * stack:
- *
- * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
- *
- * will translate to:
- *
- * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax
- *
- * which is below the current RSP by almost 16K.
- */
-#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
-
#endif
-/*
- * Thread-synchronous status.
- *
- * This is different from the flags in that nobody else
- * ever touches our thread-synchronous status, so we don't
- * have to worry about atomic accesses.
- */
-#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
#ifdef CONFIG_COMPAT
#define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */
#endif
-
#ifndef __ASSEMBLY__
-static inline bool in_ia32_syscall(void)
-{
#ifdef CONFIG_X86_32
- return true;
-#endif
-#ifdef CONFIG_IA32_EMULATION
- if (current_thread_info()->status & TS_COMPAT)
- return true;
+#define in_ia32_syscall() true
+#else
+#define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
+ current->thread.status & TS_COMPAT)
#endif
- return false;
-}
/*
* Force syscall return via IRET by making it look as if there was
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index c3496619740a..01fd0a7f48cd 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -117,6 +117,12 @@ extern void ist_exit(struct pt_regs *regs);
extern void ist_begin_non_atomic(struct pt_regs *regs);
extern void ist_end_non_atomic(void);
+#ifdef CONFIG_VMAP_STACK
+void __noreturn handle_stack_overflow(const char *message,
+ struct pt_regs *regs,
+ unsigned long fault_address);
+#endif
+
/* Interrupts/Exceptions */
enum {
X86_TRAP_DE = 0, /* 0, Divide-by-zero */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 2131c4ce7d8a..faf3687f1035 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -11,6 +11,7 @@
#include <asm/asm.h>
#include <asm/page.h>
#include <asm/smap.h>
+#include <asm/extable.h>
#define VERIFY_READ 0
#define VERIFY_WRITE 1
@@ -91,37 +92,6 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
likely(!__range_not_ok(addr, size, user_addr_max()))
/*
- * The exception table consists of triples of addresses relative to the
- * exception table entry itself. The first address is of an instruction
- * that is allowed to fault, the second is the target at which the program
- * should continue. The third is a handler function to deal with the fault
- * caused by the instruction in the first field.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path. This means when everything is well,
- * we don't even have to jump over them. Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry {
- int insn, fixup, handler;
-};
-
-#define ARCH_HAS_RELATIVE_EXTABLE
-
-#define swap_ex_entry_fixup(a, b, tmp, delta) \
- do { \
- (a)->fixup = (b)->fixup + (delta); \
- (b)->fixup = (tmp).fixup - (delta); \
- (a)->handler = (b)->handler + (delta); \
- (b)->handler = (tmp).handler - (delta); \
- } while (0)
-
-extern int fixup_exception(struct pt_regs *regs, int trapnr);
-extern bool ex_has_fault_handler(unsigned long ip);
-extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
-
-/*
* These are the main single-value transfer routines. They automatically
* use the right size if we just have the right pointer type.
*
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
new file mode 100644
index 000000000000..46de9ac4b990
--- /dev/null
+++ b/arch/x86/include/asm/unwind.h
@@ -0,0 +1,63 @@
+#ifndef _ASM_X86_UNWIND_H
+#define _ASM_X86_UNWIND_H
+
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+
+struct unwind_state {
+ struct stack_info stack_info;
+ unsigned long stack_mask;
+ struct task_struct *task;
+ int graph_idx;
+#ifdef CONFIG_FRAME_POINTER
+ unsigned long *bp;
+#else
+ unsigned long *sp;
+#endif
+};
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame);
+
+bool unwind_next_frame(struct unwind_state *state);
+
+unsigned long unwind_get_return_address(struct unwind_state *state);
+
+static inline bool unwind_done(struct unwind_state *state)
+{
+ return state->stack_info.type == STACK_TYPE_UNKNOWN;
+}
+
+static inline
+void unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ first_frame = first_frame ? : get_stack_pointer(task, regs);
+
+ __unwind_start(state, task, regs, first_frame);
+}
+
+#ifdef CONFIG_FRAME_POINTER
+
+static inline
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ return state->bp + 1;
+}
+
+#else /* !CONFIG_FRAME_POINTER */
+
+static inline
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_FRAME_POINTER */
+
+#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index cc44d926c17e..57ab86d94d64 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -49,14 +49,12 @@
#define UV_NET_ENDPOINT_INTD (is_uv1_hub() ? \
UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD)
#define UV_DESC_PSHIFT 49
-#define UV_PAYLOADQ_PNODE_SHIFT 49
+#define UV_PAYLOADQ_GNODE_SHIFT 49
#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
#define UV_BAU_BASENAME "sgi_uv/bau_tunables"
#define UV_BAU_TUNABLES_DIR "sgi_uv"
#define UV_BAU_TUNABLES_FILE "bau_tunables"
#define WHITESPACE " \t\n"
-#define uv_mmask ((1UL << uv_hub_info->m_val) - 1)
-#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
#define cpubit_isset(cpu, bau_local_cpumask) \
test_bit((cpu), (bau_local_cpumask).bits)
@@ -387,6 +385,17 @@ struct uv2_3_bau_msg_header {
/* bits 127:120 */
};
+/* Abstracted BAU functions */
+struct bau_operations {
+ unsigned long (*read_l_sw_ack)(void);
+ unsigned long (*read_g_sw_ack)(int pnode);
+ unsigned long (*bau_gpa_to_offset)(unsigned long vaddr);
+ void (*write_l_sw_ack)(unsigned long mmr);
+ void (*write_g_sw_ack)(int pnode, unsigned long mmr);
+ void (*write_payload_first)(int pnode, unsigned long mmr);
+ void (*write_payload_last)(int pnode, unsigned long mmr);
+};
+
/*
* The activation descriptor:
* The format of the message to send, plus all accompanying control
@@ -655,6 +664,16 @@ static inline void write_gmmr_activation(int pnode, unsigned long mmr_image)
write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image);
}
+static inline void write_mmr_proc_payload_first(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UV4H_LB_PROC_INTD_QUEUE_FIRST, mmr_image);
+}
+
+static inline void write_mmr_proc_payload_last(int pnode, unsigned long mmr_image)
+{
+ write_gmmr(pnode, UV4H_LB_PROC_INTD_QUEUE_LAST, mmr_image);
+}
+
static inline void write_mmr_payload_first(int pnode, unsigned long mmr_image)
{
write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image);
@@ -700,6 +719,26 @@ static inline unsigned long read_gmmr_sw_ack(int pnode)
return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
}
+static inline void write_mmr_proc_sw_ack(unsigned long mr)
+{
+ uv_write_local_mmr(UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR, mr);
+}
+
+static inline void write_gmmr_proc_sw_ack(int pnode, unsigned long mr)
+{
+ write_gmmr(pnode, UV4H_LB_PROC_INTD_SOFT_ACK_CLEAR, mr);
+}
+
+static inline unsigned long read_mmr_proc_sw_ack(void)
+{
+ return read_lmmr(UV4H_LB_PROC_INTD_SOFT_ACK_PENDING);
+}
+
+static inline unsigned long read_gmmr_proc_sw_ack(int pnode)
+{
+ return read_gmmr(pnode, UV4H_LB_PROC_INTD_SOFT_ACK_PENDING);
+}
+
static inline void write_mmr_data_config(int pnode, unsigned long mr)
{
uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr);
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 43dc55be524e..2444189cbe28 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -41,6 +41,8 @@ extern const struct vdso_image vdso_image_32;
extern void __init init_vdso_image(const struct vdso_image *image);
+extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);
+
#endif /* __ASSEMBLER__ */
#endif /* _ASM_X86_VDSO_H */
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index e6911caf5bbf..608a79d5a466 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -20,15 +20,4 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
/* No need for a barrier -- XCHG is a barrier on x86. */
#define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
-extern int xen_have_vector_callback;
-
-/*
- * Events delivered via platform PCI interrupts are always
- * routed to vcpu 0 and hence cannot be rebound.
- */
-static inline bool xen_support_evtchn_rebind(void)
-{
- return (!xen_hvm_domain() || xen_have_vector_callback);
-}
-
#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index 2184943341bf..69a6e07e3149 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -26,6 +26,8 @@ struct mce {
__u32 socketid; /* CPU socket ID */
__u32 apicid; /* CPU initial apic ID */
__u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
+ __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
+ __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
};
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 3ac5032fae09..ae135de547f5 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -6,4 +6,10 @@
#define ARCH_GET_FS 0x1003
#define ARCH_GET_GS 0x1004
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# define ARCH_MAP_VDSO_X32 0x2001
+# define ARCH_MAP_VDSO_32 0x2002
+# define ARCH_MAP_VDSO_64 0x2003
+#endif
+
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0503f5bfb18d..79076d75bdbf 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -46,9 +46,7 @@ obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
-obj-$(CONFIG_X86_32) += i386_ksyms_32.o
-obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64) += mcount_64.o
+obj-$(CONFIG_X86_64) += sys_x86_64.o mcount_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
@@ -83,6 +81,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-y += apic/
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
@@ -125,6 +124,12 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
+ifdef CONFIG_FRAME_POINTER
+obj-y += unwind_frame.o
+else
+obj-y += unwind_guess.o
+endif
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 3242e591fa82..26b78d86f25a 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,6 +1,7 @@
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
+obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 90d84c3eee53..8a5abaa7d453 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -176,15 +176,10 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
return -EINVAL;
}
- if (!enabled) {
- ++disabled_cpus;
- return -EINVAL;
- }
-
if (boot_cpu_physical_apicid != -1U)
- ver = apic_version[boot_cpu_physical_apicid];
+ ver = boot_cpu_apic_version;
- cpu = generic_processor_info(id, ver);
+ cpu = __generic_processor_info(id, ver, enabled);
if (cpu >= 0)
early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid;
@@ -238,6 +233,10 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
acpi_table_print_madt_entry(header);
+ /* Ignore invalid ID */
+ if (processor->id == 0xff)
+ return 0;
+
/*
* We need to register disabled CPU as well to permit
* counting disabled CPUs. This allows us to size
@@ -282,6 +281,8 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header,
if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
return -EINVAL;
+ acpi_table_print_madt_entry(header);
+
acpi_lapic_addr = lapic_addr_ovr->address;
return 0;
@@ -705,7 +706,7 @@ static void __init acpi_set_irq_model_ioapic(void)
#ifdef CONFIG_ACPI_HOTPLUG_CPU
#include <acpi/processor.h>
-static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
{
#ifdef CONFIG_ACPI_NUMA
int nid;
@@ -716,6 +717,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
numa_set_node(cpu, nid);
}
#endif
+ return 0;
}
int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
@@ -998,21 +1000,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
if (!boot_cpu_has(X86_FEATURE_APIC))
return -ENODEV;
- /*
- * Note that the LAPIC address is obtained from the MADT (32-bit value)
- * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
- */
-
- count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
- acpi_parse_lapic_addr_ovr, 0);
- if (count < 0) {
- printk(KERN_ERR PREFIX
- "Error parsing LAPIC address override entry\n");
- return count;
- }
-
- register_lapic_address(acpi_lapic_addr);
-
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
acpi_parse_sapic, MAX_LOCAL_APIC);
@@ -1031,8 +1018,8 @@ static int __init acpi_parse_madt_lapic_entries(void)
return ret;
}
- x2count = madt_proc[0].count;
- count = madt_proc[1].count;
+ count = madt_proc[0].count;
+ x2count = madt_proc[1].count;
}
if (!count && !x2count) {
printk(KERN_ERR PREFIX "No LAPIC entries present\n");
@@ -1513,7 +1500,7 @@ void __init acpi_boot_table_init(void)
* If acpi_disabled, bail out
*/
if (acpi_disabled)
- return;
+ return;
/*
* Initialize the ACPI boot-time table parser.
diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c
new file mode 100644
index 000000000000..6fb478bf82fd
--- /dev/null
+++ b/arch/x86/kernel/acpi/cppc_msr.c
@@ -0,0 +1,58 @@
+/*
+ * cppc_msr.c: MSR Interface for CPPC
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <acpi/cppc_acpi.h>
+#include <asm/msr.h>
+
+/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
+
+bool cpc_ffh_supported(void)
+{
+ return true;
+}
+
+int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
+{
+ int err;
+
+ err = rdmsrl_safe_on_cpu(cpunum, reg->address, val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ *val &= mask;
+ *val >>= reg->bit_offset;
+ }
+ return err;
+}
+
+int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
+{
+ u64 rd_val;
+ int err;
+
+ err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val);
+ if (!err) {
+ u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1,
+ reg->bit_offset);
+
+ val <<= reg->bit_offset;
+ val &= mask;
+ rd_val &= ~mask;
+ rd_val |= val;
+ err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val);
+ }
+ return err;
+}
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index bdfad642123f..af15f4444330 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -152,7 +152,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
-void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
+void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
struct cstate_entry *percpu_entry;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index adb3eaf8fe2a..48587335ede8 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -99,7 +99,7 @@ int x86_acpi_suspend_lowlevel(void)
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
#ifdef CONFIG_SMP
- stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
+ initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_table(smp_processor_id());
initial_gs = per_cpu_offset(smp_processor_id());
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f3e9b2df4b16..88c657b057e2 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -64,6 +64,8 @@ unsigned disabled_cpus;
unsigned int boot_cpu_physical_apicid = -1U;
EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
+u8 boot_cpu_apic_version;
+
/*
* The highest APIC ID seen during enumeration.
*/
@@ -1374,7 +1376,6 @@ void setup_local_APIC(void)
* Actually disabling the focus CPU check just makes the hang less
* frequent as it makes the interrupt distributon model be more
* like LRU than MRU (the short-term load is more even across CPUs).
- * See also the comment in end_level_ioapic_irq(). --macro
*/
/*
@@ -1816,8 +1817,7 @@ void __init init_apic_mappings(void)
* since smp_sanity_check is prepared for such a case
* and disable smp mode
*/
- apic_version[new_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
}
@@ -1828,17 +1828,14 @@ void __init register_lapic_address(unsigned long address)
if (!x2apic_mode) {
set_fixmap_nocache(FIX_APIC_BASE, address);
apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, mp_lapic_addr);
+ APIC_BASE, address);
}
if (boot_cpu_physical_apicid == -1U) {
boot_cpu_physical_apicid = read_apic_id();
- apic_version[boot_cpu_physical_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
+ boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
}
-int apic_version[MAX_LOCAL_APIC];
-
/*
* Local APIC interrupts
*/
@@ -2027,7 +2024,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}
-int generic_processor_info(int apicid, int version)
+/*
+ * The number of allocated logical CPU IDs. Since logical CPU IDs are allocated
+ * contiguously, it equals to current allocated max logical CPU ID plus 1.
+ * All allocated CPU ID should be in [0, nr_logical_cpuidi), so the maximum of
+ * nr_logical_cpuids is nr_cpu_ids.
+ *
+ * NOTE: Reserve 0 for BSP.
+ */
+static int nr_logical_cpuids = 1;
+
+/*
+ * Used to store mapping between logical CPU IDs and APIC IDs.
+ */
+static int cpuid_to_apicid[] = {
+ [0 ... NR_CPUS - 1] = -1,
+};
+
+/*
+ * Should use this API to allocate logical CPU IDs to keep nr_logical_cpuids
+ * and cpuid_to_apicid[] synchronized.
+ */
+static int allocate_logical_cpuid(int apicid)
+{
+ int i;
+
+ /*
+ * cpuid <-> apicid mapping is persistent, so when a cpu is up,
+ * check if the kernel has allocated a cpuid for it.
+ */
+ for (i = 0; i < nr_logical_cpuids; i++) {
+ if (cpuid_to_apicid[i] == apicid)
+ return i;
+ }
+
+ /* Allocate a new cpuid. */
+ if (nr_logical_cpuids >= nr_cpu_ids) {
+ WARN_ONCE(1, "Only %d processors supported."
+ "Processor %d/0x%x and the rest are ignored.\n",
+ nr_cpu_ids - 1, nr_logical_cpuids, apicid);
+ return -1;
+ }
+
+ cpuid_to_apicid[nr_logical_cpuids] = apicid;
+ return nr_logical_cpuids++;
+}
+
+int __generic_processor_info(int apicid, int version, bool enabled)
{
int cpu, max = nr_cpu_ids;
bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
@@ -2085,9 +2128,11 @@ int generic_processor_info(int apicid, int version)
if (num_processors >= nr_cpu_ids) {
int thiscpu = max + disabled_cpus;
- pr_warning(
- "APIC: NR_CPUS/possible_cpus limit of %i reached."
- " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+ if (enabled) {
+ pr_warning("APIC: NR_CPUS/possible_cpus limit of %i "
+ "reached. Processor %d/0x%x ignored.\n",
+ max, thiscpu, apicid);
+ }
disabled_cpus++;
return -EINVAL;
@@ -2102,8 +2147,16 @@ int generic_processor_info(int apicid, int version)
* for BSP.
*/
cpu = 0;
- } else
- cpu = cpumask_next_zero(-1, cpu_present_mask);
+
+ /* Logical cpuid 0 is reserved for BSP. */
+ cpuid_to_apicid[0] = apicid;
+ } else {
+ cpu = allocate_logical_cpuid(apicid);
+ if (cpu < 0) {
+ disabled_cpus++;
+ return -EINVAL;
+ }
+ }
/*
* This can happen on physical hotplug. The sanity check at boot time
@@ -2120,8 +2173,6 @@ int generic_processor_info(int apicid, int version)
return -ENOSPC;
}
- num_processors++;
-
/*
* Validate version
*/
@@ -2130,14 +2181,12 @@ int generic_processor_info(int apicid, int version)
cpu, apicid);
version = 0x10;
}
- apic_version[apicid] = version;
- if (version != apic_version[boot_cpu_physical_apicid]) {
+ if (version != boot_cpu_apic_version) {
pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
- apic_version[boot_cpu_physical_apicid], cpu, version);
+ boot_cpu_apic_version, cpu, version);
}
- physid_set(apicid, phys_cpu_present_map);
if (apicid > max_physical_apicid)
max_physical_apicid = apicid;
@@ -2150,11 +2199,23 @@ int generic_processor_info(int apicid, int version)
apic->x86_32_early_logical_apicid(cpu);
#endif
set_cpu_possible(cpu, true);
- set_cpu_present(cpu, true);
+
+ if (enabled) {
+ num_processors++;
+ physid_set(apicid, phys_cpu_present_map);
+ set_cpu_present(cpu, true);
+ } else {
+ disabled_cpus++;
+ }
return cpu;
}
+int generic_processor_info(int apicid, int version)
+{
+ return __generic_processor_info(apicid, version, true);
+}
+
int hard_smp_processor_id(void)
{
return read_apic_id();
@@ -2277,7 +2338,7 @@ int __init APIC_init_uniprocessor(void)
* Complain if the BIOS pretends there is one.
*/
if (!boot_cpu_has(X86_FEATURE_APIC) &&
- APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ APIC_INTEGRATED(boot_cpu_apic_version)) {
pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
boot_cpu_physical_apicid);
return -1;
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 5b2ae106bd4a..a4d7ff20ed22 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -25,7 +25,7 @@
static struct apic apic_physflat;
static struct apic apic_flat;
-struct apic __read_mostly *apic = &apic_flat;
+struct apic *apic __ro_after_init = &apic_flat;
EXPORT_SYMBOL_GPL(apic);
static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -116,27 +116,17 @@ static void flat_send_IPI_all(int vector)
static unsigned int flat_get_apic_id(unsigned long x)
{
- unsigned int id;
-
- id = (((x)>>24) & 0xFFu);
-
- return id;
+ return (x >> 24) & 0xFF;
}
static unsigned long set_apic_id(unsigned int id)
{
- unsigned long x;
-
- x = ((id & 0xFFu)<<24);
- return x;
+ return (id & 0xFF) << 24;
}
static unsigned int read_xapic_id(void)
{
- unsigned int id;
-
- id = flat_get_apic_id(apic_read(APIC_ID));
- return id;
+ return flat_get_apic_id(apic_read(APIC_ID));
}
static int flat_apic_id_registered(void)
@@ -154,7 +144,7 @@ static int flat_probe(void)
return 1;
}
-static struct apic apic_flat = {
+static struct apic apic_flat __ro_after_init = {
.name = "flat",
.probe = flat_probe,
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
@@ -248,7 +238,7 @@ static int physflat_probe(void)
return 0;
}
-static struct apic apic_physflat = {
+static struct apic apic_physflat __ro_after_init = {
.name = "physical flat",
.probe = physflat_probe,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index c05688b2deff..b109e4389c92 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -108,7 +108,7 @@ static void noop_apic_write(u32 reg, u32 v)
WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
}
-struct apic apic_noop = {
+struct apic apic_noop __ro_after_init = {
.name = "noop",
.probe = noop_probe,
.acpi_madt_oem_check = NULL,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 714d4fda0d52..e08fe2c8dd8c 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -40,10 +40,7 @@ static unsigned int numachip1_get_apic_id(unsigned long x)
static unsigned long numachip1_set_apic_id(unsigned int id)
{
- unsigned long x;
-
- x = ((id & 0xffU) << 24);
- return x;
+ return (id & 0xff) << 24;
}
static unsigned int numachip2_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 06dbaa458bfe..56012010332c 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -142,7 +142,7 @@ static int probe_bigsmp(void)
return dmi_bigsmp;
}
-static struct apic apic_bigsmp = {
+static struct apic apic_bigsmp __ro_after_init = {
.name = "bigsmp",
.probe = probe_bigsmp,
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index f29501e1a5c1..c73c9fb281e1 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -26,32 +26,32 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
}
#endif
-#ifdef arch_trigger_all_cpu_backtrace
+#ifdef arch_trigger_cpumask_backtrace
static void nmi_raise_cpu_backtrace(cpumask_t *mask)
{
apic->send_IPI_mask(mask, NMI_VECTOR);
}
-void arch_trigger_all_cpu_backtrace(bool include_self)
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
{
- nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace);
+ nmi_trigger_cpumask_backtrace(mask, exclude_self,
+ nmi_raise_cpu_backtrace);
}
-static int
-arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
+static int nmi_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
if (nmi_cpu_backtrace(regs))
return NMI_HANDLED;
return NMI_DONE;
}
-NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler);
+NOKPROBE_SYMBOL(nmi_cpu_backtrace_handler);
-static int __init register_trigger_all_cpu_backtrace(void)
+static int __init register_nmi_cpu_backtrace_handler(void)
{
- register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler,
+ register_nmi_handler(NMI_LOCAL, nmi_cpu_backtrace_handler,
0, "arch_bt");
return 0;
}
-early_initcall(register_trigger_all_cpu_backtrace);
+early_initcall(register_nmi_cpu_backtrace_handler);
#endif
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7491f417a8e4..48e6d84f173e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1593,7 +1593,7 @@ void __init setup_ioapic_ids_from_mpc(void)
* no meaning without the serial APIC bus.
*/
if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ || APIC_XAPIC(boot_cpu_apic_version))
return;
setup_ioapic_ids_from_mpc_nocheck();
}
@@ -2423,7 +2423,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id)
static u8 io_apic_unique_id(int idx, u8 id)
{
if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ !APIC_XAPIC(boot_cpu_apic_version))
return io_apic_get_unique_id(idx, id);
else
return id;
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index ade25320df96..015bbf30e3e3 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -269,7 +269,7 @@ static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
}
-static struct irq_chip hpet_msi_controller = {
+static struct irq_chip hpet_msi_controller __ro_after_init = {
.name = "HPET-MSI",
.irq_unmask = hpet_msi_unmask,
.irq_mask = hpet_msi_mask,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 7c43e716c158..c48264e202fd 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -72,7 +72,7 @@ static int probe_default(void)
return 1;
}
-static struct apic apic_default = {
+static struct apic apic_default __ro_after_init = {
.name = "default",
.probe = probe_default,
@@ -126,7 +126,7 @@ static struct apic apic_default = {
apic_driver(apic_default);
-struct apic *apic = &apic_default;
+struct apic *apic __ro_after_init = &apic_default;
EXPORT_SYMBOL_GPL(apic);
static int cmdline_apic __initdata;
@@ -152,7 +152,7 @@ early_param("apic", parse_apic);
void __init default_setup_apic_routing(void)
{
- int version = apic_version[boot_cpu_physical_apicid];
+ int version = boot_cpu_apic_version;
if (num_possible_cpus() > 8) {
switch (boot_cpu_data.x86_vendor) {
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 6066d945c40e..5d30c5e42bb1 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -661,11 +661,28 @@ void irq_complete_move(struct irq_cfg *cfg)
*/
void irq_force_complete_move(struct irq_desc *desc)
{
- struct irq_data *irqdata = irq_desc_get_irq_data(desc);
- struct apic_chip_data *data = apic_chip_data(irqdata);
- struct irq_cfg *cfg = data ? &data->cfg : NULL;
+ struct irq_data *irqdata;
+ struct apic_chip_data *data;
+ struct irq_cfg *cfg;
unsigned int cpu;
+ /*
+ * The function is called for all descriptors regardless of which
+ * irqdomain they belong to. For example if an IRQ is provided by
+ * an irq_chip as part of a GPIO driver, the chip data for that
+ * descriptor is specific to the irq_chip in question.
+ *
+ * Check first that the chip_data is what we expect
+ * (apic_chip_data) before touching it any further.
+ */
+ irqdata = irq_domain_get_irq_data(x86_vector_domain,
+ irq_desc_get_irq(desc));
+ if (!irqdata)
+ return;
+
+ data = apic_chip_data(irqdata);
+ cfg = data ? &data->cfg : NULL;
+
if (!cfg)
return;
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 54f35d988025..200af5ae9662 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -227,7 +227,7 @@ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
}
-static struct apic apic_x2apic_cluster = {
+static struct apic apic_x2apic_cluster __ro_after_init = {
.name = "cluster x2apic",
.probe = x2apic_cluster_probe,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 4f13f54f1b1f..ff111f05a314 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -98,7 +98,7 @@ static int x2apic_phys_probe(void)
return apic == &apic_x2apic_phys;
}
-static struct apic apic_x2apic_phys = {
+static struct apic apic_x2apic_phys __ro_after_init = {
.name = "physical x2apic",
.probe = x2apic_phys_probe,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index cb0673c1e940..aeef53ce93e1 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -533,11 +533,8 @@ static unsigned int x2apic_get_apic_id(unsigned long x)
static unsigned long set_apic_id(unsigned int id)
{
- unsigned long x;
-
- /* maskout x2apic_extra_bits ? */
- x = id;
- return x;
+ /* CHECKME: Do we need to mask out the xapic extra bits? */
+ return id;
}
static unsigned int uv_read_apic_id(void)
@@ -560,7 +557,7 @@ static int uv_probe(void)
return apic == &apic_x2apic_uv_x;
}
-static struct apic __refdata apic_x2apic_uv_x = {
+static struct apic apic_x2apic_uv_x __ro_after_init = {
.name = "UV large system",
.probe = uv_probe,
@@ -927,7 +924,7 @@ static void uv_heartbeat(unsigned long ignored)
mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
}
-static void uv_heartbeat_enable(int cpu)
+static int uv_heartbeat_enable(unsigned int cpu)
{
while (!uv_cpu_scir_info(cpu)->enabled) {
struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer;
@@ -941,43 +938,24 @@ static void uv_heartbeat_enable(int cpu)
/* also ensure that boot cpu is enabled */
cpu = 0;
}
+ return 0;
}
#ifdef CONFIG_HOTPLUG_CPU
-static void uv_heartbeat_disable(int cpu)
+static int uv_heartbeat_disable(unsigned int cpu)
{
if (uv_cpu_scir_info(cpu)->enabled) {
uv_cpu_scir_info(cpu)->enabled = 0;
del_timer(&uv_cpu_scir_info(cpu)->timer);
}
uv_set_cpu_scir_bits(cpu, 0xff);
-}
-
-/*
- * cpu hotplug notifier
- */
-static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action,
- void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- uv_heartbeat_enable(cpu);
- break;
- case CPU_DOWN_PREPARE:
- uv_heartbeat_disable(cpu);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
+ return 0;
}
static __init void uv_scir_register_cpu_notifier(void)
{
- hotcpu_notifier(uv_scir_cpu_notify, 0);
+ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/x2apic-uvx:online",
+ uv_heartbeat_enable, uv_heartbeat_disable);
}
#else /* !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 2bd5c6ff7ee7..c62e015b126c 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -29,10 +29,13 @@
void common(void) {
BLANK();
- OFFSET(TI_flags, thread_info, flags);
- OFFSET(TI_status, thread_info, status);
+ OFFSET(TASK_threadsp, task_struct, thread.sp);
+#ifdef CONFIG_CC_STACKPROTECTOR
+ OFFSET(TASK_stack_canary, task_struct, stack_canary);
+#endif
BLANK();
+ OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
BLANK();
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index ecdc1d217dc0..880aa093268d 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -57,6 +57,11 @@ void foo(void)
/* Size of SYSENTER_stack */
DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+#ifdef CONFIG_CC_STACKPROTECTOR
+ BLANK();
+ OFFSET(stack_canary_offset, stack_canary, canary);
+#endif
+
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index d875f97d4e0b..210927ee2e74 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -56,6 +56,11 @@ int main(void)
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
BLANK();
+#ifdef CONFIG_CC_STACKPROTECTOR
+ DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary));
+ BLANK();
+#endif
+
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
DEFINE(NR_syscalls, sizeof(syscalls_64));
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 809eda03c527..9bd910a7dd0a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -804,21 +804,20 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
identify_cpu_without_cpuid(c);
/* cyrix could have cpuid enabled via c_identify()*/
- if (!have_cpuid_p())
- return;
-
- cpu_detect(c);
- get_cpu_vendor(c);
- get_cpu_cap(c);
+ if (have_cpuid_p()) {
+ cpu_detect(c);
+ get_cpu_vendor(c);
+ get_cpu_cap(c);
- if (this_cpu->c_early_init)
- this_cpu->c_early_init(c);
+ if (this_cpu->c_early_init)
+ this_cpu->c_early_init(c);
- c->cpu_index = 0;
- filter_cpuid_features(c, false);
+ c->cpu_index = 0;
+ filter_cpuid_features(c, false);
- if (this_cpu->c_bsp_init)
- this_cpu->c_bsp_init(c);
+ if (this_cpu->c_bsp_init)
+ this_cpu->c_bsp_init(c);
+ }
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
fpu__init_system(c);
@@ -1265,9 +1264,14 @@ static __init int setup_disablecpuid(char *arg)
__setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
-struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
-struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
- (unsigned long) debug_idt_table };
+struct desc_ptr idt_descr __ro_after_init = {
+ .size = NR_VECTORS * 16 - 1,
+ .address = (unsigned long) idt_table,
+};
+const struct desc_ptr debug_idt_descr = {
+ .size = NR_VECTORS * 16 - 1,
+ .address = (unsigned long) debug_idt_table,
+};
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE) __visible;
@@ -1281,7 +1285,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
EXPORT_PER_CPU_SYMBOL(current_task);
DEFINE_PER_CPU(char *, irq_stack_ptr) =
- init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+ init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE;
DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
@@ -1305,11 +1309,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
- /*
- * LSTAR and STAR live in a bit strange symbiosis.
- * They both write to the same internal register. STAR allows to
- * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
- */
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 27e46658ebe3..35691a6b0d32 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -86,3 +86,14 @@ bool __init hypervisor_x2apic_available(void)
x86_hyper->x2apic_available &&
x86_hyper->x2apic_available();
}
+
+void hypervisor_pin_vcpu(int cpu)
+{
+ if (!x86_hyper)
+ return;
+
+ if (x86_hyper->pin_vcpu)
+ x86_hyper->pin_vcpu(cpu);
+ else
+ WARN_ONCE(1, "vcpu pinning requested but not supported!\n");
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 79d8ec849468..a7fdf453d895 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -41,6 +41,7 @@
#include <linux/debugfs.h>
#include <linux/irq_work.h>
#include <linux/export.h>
+#include <linux/jump_label.h>
#include <asm/processor.h>
#include <asm/traps.h>
@@ -292,6 +293,13 @@ static void print_mce(struct mce *m)
if (m->misc)
pr_cont("MISC %llx ", m->misc);
+ if (mce_flags.smca) {
+ if (m->synd)
+ pr_cont("SYND %llx ", m->synd);
+ if (m->ipid)
+ pr_cont("IPID %llx ", m->ipid);
+ }
+
pr_cont("\n");
/*
* Note this output is parsed by external tools and old fields
@@ -568,6 +576,7 @@ static void mce_read_aux(struct mce *m, int i)
{
if (m->status & MCI_STATUS_MISCV)
m->misc = mce_rdmsrl(msr_ops.misc(i));
+
if (m->status & MCI_STATUS_ADDRV) {
m->addr = mce_rdmsrl(msr_ops.addr(i));
@@ -579,6 +588,23 @@ static void mce_read_aux(struct mce *m, int i)
m->addr >>= shift;
m->addr <<= shift;
}
+
+ /*
+ * Extract [55:<lsb>] where lsb is the least significant
+ * *valid* bit of the address bits.
+ */
+ if (mce_flags.smca) {
+ u8 lsb = (m->addr >> 56) & 0x3f;
+
+ m->addr &= GENMASK_ULL(55, lsb);
+ }
+ }
+
+ if (mce_flags.smca) {
+ m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
+
+ if (m->status & MCI_STATUS_SYNDV)
+ m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
}
}
@@ -1633,17 +1659,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model == 45)
quirk_no_way_out = quirk_sandybridge_ifu;
- /*
- * MCG_CAP.MCG_SER_P is necessary but not sufficient to know
- * whether this processor will actually generate recoverable
- * machine checks. Check to see if this is an E7 model Xeon.
- * We can't do a model number check because E5 and E7 use the
- * same model number. E5 doesn't support recovery, E7 does.
- */
- if (mca_cfg.recovery || (mca_cfg.ser &&
- !strncmp(c->x86_model_id,
- "Intel(R) Xeon(R) CPU E7-", 24)))
- set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY);
}
if (cfg->monarch_timeout < 0)
cfg->monarch_timeout = 0;
@@ -2080,6 +2095,7 @@ void mce_disable_bank(int bank)
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
* mce=bios_cmci_threshold Don't program the CMCI threshold
+ * mce=recovery force enable memcpy_mcsafe()
*/
static int __init mcheck_enable(char *str)
{
@@ -2676,8 +2692,14 @@ static int __init mcheck_debugfs_init(void)
static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif
+DEFINE_STATIC_KEY_FALSE(mcsafe_key);
+EXPORT_SYMBOL_GPL(mcsafe_key);
+
static int __init mcheck_late_init(void)
{
+ if (mca_cfg.recovery)
+ static_branch_inc(&mcsafe_key);
+
mcheck_debugfs_init();
/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 7b7f3be783d4..9b5403462936 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/smp.h>
+#include <linux/string.h>
#include <asm/amd_nb.h>
#include <asm/apic.h>
@@ -63,34 +64,71 @@ static const char * const th_names[] = {
"execution_unit",
};
-/* Define HWID to IP type mappings for Scalable MCA */
-struct amd_hwid amd_hwids[] = {
- [SMCA_F17H_CORE] = { "f17h_core", 0xB0 },
- [SMCA_DF] = { "data_fabric", 0x2E },
- [SMCA_UMC] = { "umc", 0x96 },
- [SMCA_PB] = { "param_block", 0x5 },
- [SMCA_PSP] = { "psp", 0xFF },
- [SMCA_SMU] = { "smu", 0x1 },
+static const char * const smca_umc_block_names[] = {
+ "dram_ecc",
+ "misc_umc"
};
-EXPORT_SYMBOL_GPL(amd_hwids);
-
-const char * const amd_core_mcablock_names[] = {
- [SMCA_LS] = "load_store",
- [SMCA_IF] = "insn_fetch",
- [SMCA_L2_CACHE] = "l2_cache",
- [SMCA_DE] = "decode_unit",
- [RES] = "",
- [SMCA_EX] = "execution_unit",
- [SMCA_FP] = "floating_point",
- [SMCA_L3_CACHE] = "l3_cache",
+
+struct smca_bank_name smca_bank_names[] = {
+ [SMCA_LS] = { "load_store", "Load Store Unit" },
+ [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
+ [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
+ [SMCA_DE] = { "decode_unit", "Decode Unit" },
+ [SMCA_EX] = { "execution_unit", "Execution Unit" },
+ [SMCA_FP] = { "floating_point", "Floating Point Unit" },
+ [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
+ [SMCA_CS] = { "coherent_slave", "Coherent Slave" },
+ [SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
+ [SMCA_UMC] = { "umc", "Unified Memory Controller" },
+ [SMCA_PB] = { "param_block", "Parameter Block" },
+ [SMCA_PSP] = { "psp", "Platform Security Processor" },
+ [SMCA_SMU] = { "smu", "System Management Unit" },
};
-EXPORT_SYMBOL_GPL(amd_core_mcablock_names);
+EXPORT_SYMBOL_GPL(smca_bank_names);
+
+static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
+ /* { bank_type, hwid_mcatype, xec_bitmap } */
+
+ /* ZN Core (HWID=0xB0) MCA types */
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
+ { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
+ { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
+ { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
+ /* HWID 0xB0 MCATYPE 0x4 is Reserved */
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF },
+ { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F },
+ { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF },
+
+ /* Data Fabric MCA types */
+ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF },
+
+ /* Unified Memory Controller MCA type */
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F },
+
+ /* Parameter Block MCA type */
+ { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 },
+
+ /* Platform Security Processor MCA type */
+ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 },
-const char * const amd_df_mcablock_names[] = {
- [SMCA_CS] = "coherent_slave",
- [SMCA_PIE] = "pie",
+ /* System Management Unit MCA type */
+ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
};
-EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
+
+struct smca_bank_info smca_banks[MAX_NR_BANKS];
+EXPORT_SYMBOL_GPL(smca_banks);
+
+/*
+ * In SMCA enabled processors, we can have multiple banks for a given IP type.
+ * So to define a unique name for each bank, we use a temp c-string to append
+ * the MCA_IPID[InstanceId] to type's name in get_name().
+ *
+ * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN
+ * is greater than 8 plus 1 (for underscore) plus length of longest type name.
+ */
+#define MAX_MCATYPE_NAME_LEN 30
+static char buf_mcatype[MAX_MCATYPE_NAME_LEN];
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */
@@ -108,6 +146,36 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
* CPU Initialization
*/
+static void get_smca_bank_info(unsigned int bank)
+{
+ unsigned int i, hwid_mcatype, cpu = smp_processor_id();
+ struct smca_hwid_mcatype *type;
+ u32 high, instanceId;
+ u16 hwid, mcatype;
+
+ /* Collect bank_info using CPU 0 for now. */
+ if (cpu)
+ return;
+
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) {
+ pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
+ return;
+ }
+
+ hwid = high & MCI_IPID_HWID;
+ mcatype = (high & MCI_IPID_MCATYPE) >> 16;
+ hwid_mcatype = HWID_MCATYPE(hwid, mcatype);
+
+ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
+ type = &smca_hwid_mcatypes[i];
+ if (hwid_mcatype == type->hwid_mcatype) {
+ smca_banks[bank].type = type;
+ smca_banks[bank].type_instance = instanceId;
+ break;
+ }
+ }
+}
+
struct thresh_restart {
struct threshold_block *b;
int reset;
@@ -293,7 +361,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}
-static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high,
unsigned int bank, unsigned int block)
{
u32 addr = 0, offset = 0;
@@ -309,13 +377,13 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high,
*/
u32 low, high;
- if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
return addr;
if (!(low & MCI_CONFIG_MCAX))
return addr;
- if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+ if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
(low & MASK_BLKPTR_LO))
addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
}
@@ -395,6 +463,20 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
*/
smca_high &= ~BIT(2);
+ /*
+ * SMCA sets the Deferred Error Interrupt type per bank.
+ *
+ * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+ * if the DeferredIntType bit field is available.
+ *
+ * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+ * high portion of the MSR). OS should set this to 0x1 to enable
+ * APIC based interrupt. First, check that no interrupt has been
+ * set.
+ */
+ if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3))
+ smca_high |= BIT(5);
+
wrmsr(smca_addr, smca_low, smca_high);
}
@@ -421,12 +503,15 @@ out:
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
u32 low = 0, high = 0, address = 0;
- unsigned int bank, block;
+ unsigned int bank, block, cpu = smp_processor_id();
int offset = -1;
for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (mce_flags.smca)
+ get_smca_bank_info(bank);
+
for (block = 0; block < NR_BLOCKS; ++block) {
- address = get_block_address(address, low, high, bank, block);
+ address = get_block_address(cpu, address, low, high, bank, block);
if (!address)
break;
@@ -476,9 +561,27 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
if (threshold_err)
m.misc = misc;
- if (m.status & MCI_STATUS_ADDRV)
+ if (m.status & MCI_STATUS_ADDRV) {
rdmsrl(msr_addr, m.addr);
+ /*
+ * Extract [55:<lsb>] where lsb is the least significant
+ * *valid* bit of the address bits.
+ */
+ if (mce_flags.smca) {
+ u8 lsb = (m.addr >> 56) & 0x3f;
+
+ m.addr &= GENMASK_ULL(55, lsb);
+ }
+ }
+
+ if (mce_flags.smca) {
+ rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid);
+
+ if (m.status & MCI_STATUS_SYNDV)
+ rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd);
+ }
+
mce_log(&m);
wrmsrl(msr_status, 0);
@@ -541,15 +644,14 @@ static void amd_deferred_error_interrupt(void)
static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
- int cpu = smp_processor_id();
- unsigned int bank, block;
+ unsigned int bank, block, cpu = smp_processor_id();
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
- address = get_block_address(address, low, high, bank, block);
+ address = get_block_address(cpu, address, low, high, bank, block);
if (!address)
break;
@@ -713,6 +815,34 @@ static struct kobj_type threshold_ktype = {
.default_attrs = default_attrs,
};
+static const char *get_name(unsigned int bank, struct threshold_block *b)
+{
+ unsigned int bank_type;
+
+ if (!mce_flags.smca) {
+ if (b && bank == 4)
+ return bank4_names(b);
+
+ return th_names[bank];
+ }
+
+ if (!smca_banks[bank].type)
+ return NULL;
+
+ bank_type = smca_banks[bank].type->bank_type;
+
+ if (b && bank_type == SMCA_UMC) {
+ if (b->block < ARRAY_SIZE(smca_umc_block_names))
+ return smca_umc_block_names[b->block];
+ return NULL;
+ }
+
+ snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
+ "%s_%x", smca_bank_names[bank_type].name,
+ smca_banks[bank].type_instance);
+ return buf_mcatype;
+}
+
static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
unsigned int block, u32 address)
{
@@ -767,11 +897,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
err = kobject_init_and_add(&b->kobj, &threshold_ktype,
per_cpu(threshold_banks, cpu)[bank]->kobj,
- (bank == 4 ? bank4_names(b) : th_names[bank]));
+ get_name(bank, b));
if (err)
goto out_free;
recurse:
- address = get_block_address(address, low, high, bank, ++block);
+ address = get_block_address(cpu, address, low, high, bank, ++block);
if (!address)
return 0;
@@ -822,7 +952,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
struct device *dev = per_cpu(mce_device, cpu);
struct amd_northbridge *nb = NULL;
struct threshold_bank *b = NULL;
- const char *name = th_names[bank];
+ const char *name = get_name(bank, NULL);
int err = 0;
if (is_shared_bank(bank)) {
@@ -869,7 +999,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
}
}
- err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank));
+ err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
if (!err)
goto out;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index df04b2d033f6..5ce5155f0695 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -558,55 +558,36 @@ static struct syscore_ops mc_syscore_ops = {
.resume = mc_bp_resume,
};
-static int
-mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+static int mc_cpu_online(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
struct device *dev;
dev = get_cpu_device(cpu);
+ microcode_update_cpu(cpu);
+ pr_debug("CPU%d added\n", cpu);
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- microcode_update_cpu(cpu);
- pr_debug("CPU%d added\n", cpu);
- /*
- * "break" is missing on purpose here because we want to fall
- * through in order to create the sysfs group.
- */
-
- case CPU_DOWN_FAILED:
- if (sysfs_create_group(&dev->kobj, &mc_attr_group))
- pr_err("Failed to create group for CPU%d\n", cpu);
- break;
+ if (sysfs_create_group(&dev->kobj, &mc_attr_group))
+ pr_err("Failed to create group for CPU%d\n", cpu);
+ return 0;
+}
- case CPU_DOWN_PREPARE:
- /* Suspend is in progress, only remove the interface */
- sysfs_remove_group(&dev->kobj, &mc_attr_group);
- pr_debug("CPU%d removed\n", cpu);
- break;
+static int mc_cpu_down_prep(unsigned int cpu)
+{
+ struct device *dev;
+ dev = get_cpu_device(cpu);
+ /* Suspend is in progress, only remove the interface */
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
+ pr_debug("CPU%d removed\n", cpu);
/*
- * case CPU_DEAD:
- *
* When a CPU goes offline, don't free up or invalidate the copy of
* the microcode in kernel memory, so that we can reuse it when the
* CPU comes back online without unnecessarily requesting the userspace
* for it again.
*/
- }
-
- /* The CPU refused to come up during a system resume */
- if (action == CPU_UP_CANCELED_FROZEN)
- microcode_fini_cpu(cpu);
-
- return NOTIFY_OK;
+ return 0;
}
-static struct notifier_block mc_cpu_notifier = {
- .notifier_call = mc_cpu_callback,
-};
-
static struct attribute *cpu_root_microcode_attrs[] = {
&dev_attr_reload.attr,
NULL
@@ -665,7 +646,8 @@ int __init microcode_init(void)
goto out_ucode_group;
register_syscore_ops(&mc_syscore_ops);
- register_hotcpu_notifier(&mc_cpu_notifier);
+ cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
+ mc_cpu_online, mc_cpu_down_prep);
pr_info("Microcode Update Driver: v" MICROCODE_VERSION
" <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 28f1b54b7fad..24e87e74990d 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -72,14 +72,14 @@ static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
static bool mtrr_aps_delayed_init;
-static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
+static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init;
const struct mtrr_ops *mtrr_if;
static void set_mtrr(unsigned int reg, unsigned long base,
unsigned long size, mtrr_type type);
-void set_mtrr_ops(const struct mtrr_ops *ops)
+void __init set_mtrr_ops(const struct mtrr_ops *ops)
{
if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
mtrr_ops[ops->vendor] = ops;
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 6c7ced07d16d..ad8bd763efa5 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -54,7 +54,7 @@ void fill_mtrr_var_range(unsigned int index,
bool get_mtrr_state(void);
void mtrr_bp_pat_init(void);
-extern void set_mtrr_ops(const struct mtrr_ops *ops);
+extern void __init set_mtrr_ops(const struct mtrr_ops *ops);
extern u64 size_or_mask, size_and_mask;
extern const struct mtrr_ops *mtrr_if;
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 8cb57df9398d..1db8dc490b66 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -32,6 +32,8 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 },
+ { X86_FEATURE_AVX512_4VNNIW, CR_EDX, 2, 0x00000007, 0 },
+ { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1ff0598d309c..5130985b758b 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -27,6 +27,8 @@
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
+#include <asm/timer.h>
+#include <asm/apic.h>
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -82,10 +84,21 @@ static void __init vmware_platform_setup(void)
VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
- if (ebx != UINT_MAX)
+ if (ebx != UINT_MAX) {
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
- else
+#ifdef CONFIG_X86_LOCAL_APIC
+ /* Skip lapic calibration since we know the bus frequency. */
+ lapic_timer_frequency = ecx / HZ;
+ pr_info("Host bus clock speed read from hypervisor : %u Hz\n",
+ ecx);
+#endif
+ } else {
pr_warn("Failed to get TSC freq from the hypervisor\n");
+ }
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
}
/*
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 9616cf76940c..650830e39e3a 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -133,15 +133,31 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
disable_local_APIC();
}
-static void kdump_nmi_shootdown_cpus(void)
+void kdump_nmi_shootdown_cpus(void)
{
nmi_shootdown_cpus(kdump_nmi_callback);
disable_local_APIC();
}
+/* Override the weak function in kernel/panic.c */
+void crash_smp_send_stop(void)
+{
+ static int cpus_stopped;
+
+ if (cpus_stopped)
+ return;
+
+ if (smp_ops.crash_stop_other_cpus)
+ smp_ops.crash_stop_other_cpus();
+ else
+ smp_send_stop();
+
+ cpus_stopped = 1;
+}
+
#else
-static void kdump_nmi_shootdown_cpus(void)
+void crash_smp_send_stop(void)
{
/* There are no cpus to shootdown */
}
@@ -160,7 +176,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
/* The kernel is broken so disable interrupts */
local_irq_disable();
- kdump_nmi_shootdown_cpus();
+ crash_smp_send_stop();
/*
* VMCLEAR VMCSs loaded on this cpu if needed.
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 92e8f0a7159c..9b7cf5c28f5f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -17,7 +17,7 @@
#include <linux/sysfs.h>
#include <asm/stacktrace.h>
-
+#include <asm/unwind.h>
int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
@@ -25,11 +25,29 @@ unsigned int code_bytes = 64;
int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
+bool in_task_stack(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info)
+{
+ unsigned long *begin = task_stack_page(task);
+ unsigned long *end = task_stack_page(task) + THREAD_SIZE;
+
+ if (stack < begin || stack >= end)
+ return false;
+
+ info->type = STACK_TYPE_TASK;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = NULL;
+
+ return true;
+}
+
static void printk_stack_address(unsigned long address, int reliable,
- void *data)
+ char *log_lvl)
{
+ touch_nmi_watchdog();
printk("%s [<%p>] %s%pB\n",
- (char *)data, (void *)address, reliable ? "" : "? ",
+ log_lvl, (void *)address, reliable ? "" : "? ",
(void *)address);
}
@@ -38,176 +56,120 @@ void printk_address(unsigned long address)
pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address);
}
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void
-print_ftrace_graph_addr(unsigned long addr, void *data,
- const struct stacktrace_ops *ops,
- struct task_struct *task, int *graph)
+void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack, char *log_lvl)
{
- unsigned long ret_addr;
- int index;
-
- if (addr != (unsigned long)return_to_handler)
- return;
-
- index = task->curr_ret_stack;
-
- if (!task->ret_stack || index < *graph)
- return;
-
- index -= *graph;
- ret_addr = task->ret_stack[index].ret;
-
- ops->address(data, ret_addr, 1);
+ struct unwind_state state;
+ struct stack_info stack_info = {0};
+ unsigned long visit_mask = 0;
+ int graph_idx = 0;
- (*graph)++;
-}
-#else
-static inline void
-print_ftrace_graph_addr(unsigned long addr, void *data,
- const struct stacktrace_ops *ops,
- struct task_struct *task, int *graph)
-{ }
-#endif
-
-/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
-
-static inline int valid_stack_ptr(struct task_struct *task,
- void *p, unsigned int size, void *end)
-{
- void *t = task_stack_page(task);
- if (end) {
- if (p < end && p >= (end-THREAD_SIZE))
- return 1;
- else
- return 0;
- }
- return p >= t && p < t + THREAD_SIZE - size;
-}
+ printk("%sCall Trace:\n", log_lvl);
-unsigned long
-print_context_stack(struct task_struct *task,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
+ unwind_start(&state, task, regs, stack);
/*
- * If we overflowed the stack into a guard page, jump back to the
- * bottom of the usable stack.
+ * Iterate through the stacks, starting with the current stack pointer.
+ * Each stack has a pointer to the next one.
+ *
+ * x86-64 can have several stacks:
+ * - task stack
+ * - interrupt stack
+ * - HW exception stacks (double fault, nmi, debug, mce)
+ *
+ * x86-32 can have up to three stacks:
+ * - task stack
+ * - softirq stack
+ * - hardirq stack
*/
- if ((unsigned long)task_stack_page(task) - (unsigned long)stack <
- PAGE_SIZE)
- stack = (unsigned long *)task_stack_page(task);
-
- while (valid_stack_ptr(task, stack, sizeof(*stack), end)) {
- unsigned long addr;
-
- addr = *stack;
- if (__kernel_text_address(addr)) {
- if ((unsigned long) stack == bp + sizeof(long)) {
- ops->address(data, addr, 1);
- frame = frame->next_frame;
- bp = (unsigned long) frame;
- } else {
- ops->address(data, addr, 0);
- }
- print_ftrace_graph_addr(addr, data, ops, task, graph);
- }
- stack++;
- }
- return bp;
-}
-EXPORT_SYMBOL_GPL(print_context_stack);
-
-unsigned long
-print_context_stack_bp(struct task_struct *task,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data,
- unsigned long *end, int *graph)
-{
- struct stack_frame *frame = (struct stack_frame *)bp;
- unsigned long *ret_addr = &frame->return_address;
+ for (; stack; stack = stack_info.next_sp) {
+ const char *str_begin, *str_end;
- while (valid_stack_ptr(task, ret_addr, sizeof(*ret_addr), end)) {
- unsigned long addr = *ret_addr;
+ /*
+ * If we overflowed the task stack into a guard page, jump back
+ * to the bottom of the usable stack.
+ */
+ if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
+ stack = task_stack_page(task);
- if (!__kernel_text_address(addr))
+ if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
- if (ops->address(data, addr, 1))
- break;
- frame = frame->next_frame;
- ret_addr = &frame->return_address;
- print_ftrace_graph_addr(addr, data, ops, task, graph);
- }
-
- return (unsigned long)frame;
-}
-EXPORT_SYMBOL_GPL(print_context_stack_bp);
-
-static int print_trace_stack(void *data, char *name)
-{
- printk("%s <%s> ", (char *)data, name);
- return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static int print_trace_address(void *data, unsigned long addr, int reliable)
-{
- touch_nmi_watchdog();
- printk_stack_address(addr, reliable, data);
- return 0;
-}
-
-static const struct stacktrace_ops print_trace_ops = {
- .stack = print_trace_stack,
- .address = print_trace_address,
- .walk_stack = print_context_stack,
-};
-
-void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
-{
- printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
+ stack_type_str(stack_info.type, &str_begin, &str_end);
+ if (str_begin)
+ printk("%s <%s> ", log_lvl, str_begin);
+
+ /*
+ * Scan the stack, printing any text addresses we find. At the
+ * same time, follow proper stack frames with the unwinder.
+ *
+ * Addresses found during the scan which are not reported by
+ * the unwinder are considered to be additional clues which are
+ * sometimes useful for debugging and are prefixed with '?'.
+ * This also serves as a failsafe option in case the unwinder
+ * goes off in the weeds.
+ */
+ for (; stack < stack_info.end; stack++) {
+ unsigned long real_addr;
+ int reliable = 0;
+ unsigned long addr = *stack;
+ unsigned long *ret_addr_p =
+ unwind_get_return_address_ptr(&state);
+
+ if (!__kernel_text_address(addr))
+ continue;
+
+ if (stack == ret_addr_p)
+ reliable = 1;
+
+ /*
+ * When function graph tracing is enabled for a
+ * function, its return address on the stack is
+ * replaced with the address of an ftrace handler
+ * (return_to_handler). In that case, before printing
+ * the "real" address, we want to print the handler
+ * address as an "unreliable" hint that function graph
+ * tracing was involved.
+ */
+ real_addr = ftrace_graph_ret_addr(task, &graph_idx,
+ addr, stack);
+ if (real_addr != addr)
+ printk_stack_address(addr, 0, log_lvl);
+ printk_stack_address(real_addr, reliable, log_lvl);
+
+ if (!reliable)
+ continue;
+
+ /*
+ * Get the next frame from the unwinder. No need to
+ * check for an error: if anything goes wrong, the rest
+ * of the addresses will just be printed as unreliable.
+ */
+ unwind_next_frame(&state);
+ }
-void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
-{
- show_trace_log_lvl(task, regs, stack, bp, "");
+ if (str_end)
+ printk("%s <%s> ", log_lvl, str_end);
+ }
}
void show_stack(struct task_struct *task, unsigned long *sp)
{
- unsigned long bp = 0;
- unsigned long stack;
+ task = task ? : current;
/*
* Stack frames below this one aren't interesting. Don't show them
* if we're printing for %current.
*/
- if (!sp && (!task || task == current)) {
- sp = &stack;
- bp = stack_frame(current, NULL);
- }
+ if (!sp && task == current)
+ sp = get_stack_pointer(current, NULL);
- show_stack_log_lvl(task, NULL, sp, bp, "");
+ show_stack_log_lvl(task, NULL, sp, "");
}
void show_stack_regs(struct pt_regs *regs)
{
- show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, "");
+ show_stack_log_lvl(current, regs, NULL, "");
}
static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 09675712eba8..06eb322b5f9f 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,93 +16,121 @@
#include <asm/stacktrace.h>
-static void *is_irq_stack(void *p, void *irq)
+void stack_type_str(enum stack_type type, const char **begin, const char **end)
{
- if (p < irq || p >= (irq + THREAD_SIZE))
- return NULL;
- return irq + THREAD_SIZE;
+ switch (type) {
+ case STACK_TYPE_IRQ:
+ case STACK_TYPE_SOFTIRQ:
+ *begin = "IRQ";
+ *end = "EOI";
+ break;
+ default:
+ *begin = NULL;
+ *end = NULL;
+ }
}
-
-static void *is_hardirq_stack(unsigned long *stack, int cpu)
+static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
{
- void *irq = per_cpu(hardirq_stack, cpu);
+ unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack);
+ unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
- return is_irq_stack(stack, irq);
-}
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack < begin || stack > end)
+ return false;
-static void *is_softirq_stack(unsigned long *stack, int cpu)
-{
- void *irq = per_cpu(softirq_stack, cpu);
+ info->type = STACK_TYPE_IRQ;
+ info->begin = begin;
+ info->end = end;
- return is_irq_stack(stack, irq);
+ /*
+ * See irq_32.c -- the next stack pointer is stored at the beginning of
+ * the stack.
+ */
+ info->next_sp = (unsigned long *)*begin;
+
+ return true;
}
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
{
- const unsigned cpu = get_cpu();
- int graph = 0;
- u32 *prev_esp;
+ unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack);
+ unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
- if (!task)
- task = current;
+ /*
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
+ */
+ if (stack < begin || stack > end)
+ return false;
- if (!stack) {
- unsigned long dummy;
+ info->type = STACK_TYPE_SOFTIRQ;
+ info->begin = begin;
+ info->end = end;
- stack = &dummy;
- if (task != current)
- stack = (unsigned long *)task->thread.sp;
- }
+ /*
+ * The next stack pointer is stored at the beginning of the stack.
+ * See irq_32.c.
+ */
+ info->next_sp = (unsigned long *)*begin;
- if (!bp)
- bp = stack_frame(task, regs);
+ return true;
+}
- for (;;) {
- void *end_stack;
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
+{
+ if (!stack)
+ goto unknown;
- end_stack = is_hardirq_stack(stack, cpu);
- if (!end_stack)
- end_stack = is_softirq_stack(stack, cpu);
+ task = task ? : current;
- bp = ops->walk_stack(task, stack, bp, ops, data,
- end_stack, &graph);
+ if (in_task_stack(stack, task, info))
+ goto recursion_check;
- /* Stop if not on irq stack */
- if (!end_stack)
- break;
+ if (task != current)
+ goto unknown;
- /* The previous esp is saved on the bottom of the stack */
- prev_esp = (u32 *)(end_stack - THREAD_SIZE);
- stack = (unsigned long *)*prev_esp;
- if (!stack)
- break;
+ if (in_hardirq_stack(stack, info))
+ goto recursion_check;
- if (ops->stack(data, "IRQ") < 0)
- break;
- touch_nmi_watchdog();
+ if (in_softirq_stack(stack, info))
+ goto recursion_check;
+
+ goto unknown;
+
+recursion_check:
+ /*
+ * Make sure we don't iterate through any given stack more than once.
+ * If it comes up a second time then there's something wrong going on:
+ * just break out and report an unknown stack type.
+ */
+ if (visit_mask) {
+ if (*visit_mask & (1UL << info->type))
+ goto unknown;
+ *visit_mask |= 1UL << info->type;
}
- put_cpu();
+
+ return 0;
+
+unknown:
+ info->type = STACK_TYPE_UNKNOWN;
+ return -EINVAL;
}
-EXPORT_SYMBOL(dump_trace);
-void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, char *log_lvl)
{
unsigned long *stack;
int i;
- if (sp == NULL) {
- if (regs)
- sp = (unsigned long *)regs->sp;
- else if (task)
- sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
+ if (!try_get_task_stack(task))
+ return;
+
+ sp = sp ? : get_stack_pointer(task, regs);
stack = sp;
for (i = 0; i < kstack_depth_to_print; i++) {
@@ -117,7 +145,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
touch_nmi_watchdog();
}
pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, log_lvl);
+
+ put_task_stack(task);
}
@@ -139,7 +169,7 @@ void show_regs(struct pt_regs *regs)
u8 *ip;
pr_emerg("Stack:\n");
- show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
+ show_stack_log_lvl(current, regs, NULL, KERN_EMERG);
pr_emerg("Code:");
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 9ee4520ce83c..36cf1a498227 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,261 +16,145 @@
#include <asm/stacktrace.h>
+static char *exception_stack_names[N_EXCEPTION_STACKS] = {
+ [ DOUBLEFAULT_STACK-1 ] = "#DF",
+ [ NMI_STACK-1 ] = "NMI",
+ [ DEBUG_STACK-1 ] = "#DB",
+ [ MCE_STACK-1 ] = "#MC",
+};
-#define N_EXCEPTION_STACKS_END \
- (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
-
-static char x86_stack_ids[][8] = {
- [ DEBUG_STACK-1 ] = "#DB",
- [ NMI_STACK-1 ] = "NMI",
- [ DOUBLEFAULT_STACK-1 ] = "#DF",
- [ MCE_STACK-1 ] = "#MC",
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- [ N_EXCEPTION_STACKS ...
- N_EXCEPTION_STACKS_END ] = "#DB[?]"
-#endif
+static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+ [DEBUG_STACK - 1] = DEBUG_STKSZ
};
-static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
- unsigned *usedp, char **idp)
+void stack_type_str(enum stack_type type, const char **begin, const char **end)
{
- unsigned k;
-
- /*
- * Iterate over all exception stacks, and figure out whether
- * 'stack' is in one of them:
- */
- for (k = 0; k < N_EXCEPTION_STACKS; k++) {
- unsigned long end = per_cpu(orig_ist, cpu).ist[k];
- /*
- * Is 'stack' above this exception frame's end?
- * If yes then skip to the next frame.
- */
- if (stack >= end)
- continue;
- /*
- * Is 'stack' above this exception frame's start address?
- * If yes then we found the right frame.
- */
- if (stack >= end - EXCEPTION_STKSZ) {
- /*
- * Make sure we only iterate through an exception
- * stack once. If it comes up for the second time
- * then there's something wrong going on - just
- * break out and return NULL:
- */
- if (*usedp & (1U << k))
- break;
- *usedp |= 1U << k;
- *idp = x86_stack_ids[k];
- return (unsigned long *)end;
- }
- /*
- * If this is a debug stack, and if it has a larger size than
- * the usual exception stacks, then 'stack' might still
- * be within the lower portion of the debug stack:
- */
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
- unsigned j = N_EXCEPTION_STACKS - 1;
-
- /*
- * Black magic. A large debug stack is composed of
- * multiple exception stack entries, which we
- * iterate through now. Dont look:
- */
- do {
- ++j;
- end -= EXCEPTION_STKSZ;
- x86_stack_ids[j][4] = '1' +
- (j - N_EXCEPTION_STACKS);
- } while (stack < end - EXCEPTION_STKSZ);
- if (*usedp & (1U << j))
- break;
- *usedp |= 1U << j;
- *idp = x86_stack_ids[j];
- return (unsigned long *)end;
- }
-#endif
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+
+ switch (type) {
+ case STACK_TYPE_IRQ:
+ *begin = "IRQ";
+ *end = "EOI";
+ break;
+ case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST:
+ *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION];
+ *end = "EOE";
+ break;
+ default:
+ *begin = NULL;
+ *end = NULL;
}
- return NULL;
}
-static inline int
-in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
- unsigned long *irq_stack_end)
+static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
{
- return (stack >= irq_stack && stack < irq_stack_end);
-}
-
-static const unsigned long irq_stack_size =
- (IRQ_STACK_SIZE - 64) / sizeof(unsigned long);
-
-enum stack_type {
- STACK_IS_UNKNOWN,
- STACK_IS_NORMAL,
- STACK_IS_EXCEPTION,
- STACK_IS_IRQ,
-};
-
-static enum stack_type
-analyze_stack(int cpu, struct task_struct *task, unsigned long *stack,
- unsigned long **stack_end, unsigned long *irq_stack,
- unsigned *used, char **id)
-{
- unsigned long addr;
+ unsigned long *begin, *end;
+ struct pt_regs *regs;
+ unsigned k;
- addr = ((unsigned long)stack & (~(THREAD_SIZE - 1)));
- if ((unsigned long)task_stack_page(task) == addr)
- return STACK_IS_NORMAL;
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
- *stack_end = in_exception_stack(cpu, (unsigned long)stack,
- used, id);
- if (*stack_end)
- return STACK_IS_EXCEPTION;
+ for (k = 0; k < N_EXCEPTION_STACKS; k++) {
+ end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k];
+ begin = end - (exception_stack_sizes[k] / sizeof(long));
+ regs = (struct pt_regs *)end - 1;
- if (!irq_stack)
- return STACK_IS_NORMAL;
+ if (stack < begin || stack >= end)
+ continue;
- *stack_end = irq_stack;
- irq_stack = irq_stack - irq_stack_size;
+ info->type = STACK_TYPE_EXCEPTION + k;
+ info->begin = begin;
+ info->end = end;
+ info->next_sp = (unsigned long *)regs->sp;
- if (in_irq_stack(stack, irq_stack, *stack_end))
- return STACK_IS_IRQ;
+ return true;
+ }
- return STACK_IS_UNKNOWN;
+ return false;
}
-/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
-
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
- const unsigned cpu = get_cpu();
- unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
- unsigned long dummy;
- unsigned used = 0;
- int graph = 0;
- int done = 0;
-
- if (!task)
- task = current;
-
- if (!stack) {
- if (regs)
- stack = (unsigned long *)regs->sp;
- else if (task != current)
- stack = (unsigned long *)task->thread.sp;
- else
- stack = &dummy;
- }
+ unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr);
+ unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
- if (!bp)
- bp = stack_frame(task, regs);
/*
- * Print function call entries in all stacks, starting at the
- * current stack address. If the stacks consist of nested
- * exceptions
+ * This is a software stack, so 'end' can be a valid stack pointer.
+ * It just means the stack is empty.
*/
- while (!done) {
- unsigned long *stack_end;
- enum stack_type stype;
- char *id;
+ if (stack < begin || stack > end)
+ return false;
- stype = analyze_stack(cpu, task, stack, &stack_end,
- irq_stack, &used, &id);
+ info->type = STACK_TYPE_IRQ;
+ info->begin = begin;
+ info->end = end;
- /* Default finish unless specified to continue */
- done = 1;
+ /*
+ * The next stack pointer is the first thing pushed by the entry code
+ * after switching to the irq stack.
+ */
+ info->next_sp = (unsigned long *)*(end - 1);
- switch (stype) {
+ return true;
+}
- /* Break out early if we are on the thread stack */
- case STACK_IS_NORMAL:
- break;
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
+{
+ if (!stack)
+ goto unknown;
- case STACK_IS_EXCEPTION:
+ task = task ? : current;
- if (ops->stack(data, id) < 0)
- break;
+ if (in_task_stack(stack, task, info))
+ goto recursion_check;
- bp = ops->walk_stack(task, stack, bp, ops,
- data, stack_end, &graph);
- ops->stack(data, "<EOE>");
- /*
- * We link to the next stack via the
- * second-to-last pointer (index -2 to end) in the
- * exception stack:
- */
- stack = (unsigned long *) stack_end[-2];
- done = 0;
- break;
+ if (task != current)
+ goto unknown;
- case STACK_IS_IRQ:
+ if (in_exception_stack(stack, info))
+ goto recursion_check;
- if (ops->stack(data, "IRQ") < 0)
- break;
- bp = ops->walk_stack(task, stack, bp,
- ops, data, stack_end, &graph);
- /*
- * We link to the next stack (which would be
- * the process stack normally) the last
- * pointer (index -1 to end) in the IRQ stack:
- */
- stack = (unsigned long *) (stack_end[-1]);
- irq_stack = NULL;
- ops->stack(data, "EOI");
- done = 0;
- break;
+ if (in_irq_stack(stack, info))
+ goto recursion_check;
- case STACK_IS_UNKNOWN:
- ops->stack(data, "UNK");
- break;
- }
- }
+ goto unknown;
+recursion_check:
/*
- * This handles the process stack:
+ * Make sure we don't iterate through any given stack more than once.
+ * If it comes up a second time then there's something wrong going on:
+ * just break out and report an unknown stack type.
*/
- bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph);
- put_cpu();
+ if (visit_mask) {
+ if (*visit_mask & (1UL << info->type))
+ goto unknown;
+ *visit_mask |= 1UL << info->type;
+ }
+
+ return 0;
+
+unknown:
+ info->type = STACK_TYPE_UNKNOWN;
+ return -EINVAL;
}
-EXPORT_SYMBOL(dump_trace);
-void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, char *log_lvl)
{
unsigned long *irq_stack_end;
unsigned long *irq_stack;
unsigned long *stack;
- int cpu;
int i;
- preempt_disable();
- cpu = smp_processor_id();
+ if (!try_get_task_stack(task))
+ return;
- irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
- irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
+ irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr);
+ irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long));
- /*
- * Debugging aid: "show_stack(NULL, NULL);" prints the
- * back trace for this cpu:
- */
- if (sp == NULL) {
- if (regs)
- sp = (unsigned long *)regs->sp;
- else if (task)
- sp = (unsigned long *)task->thread.sp;
- else
- sp = (unsigned long *)&sp;
- }
+ sp = sp ? : get_stack_pointer(task, regs);
stack = sp;
for (i = 0; i < kstack_depth_to_print; i++) {
@@ -299,18 +183,17 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
stack++;
touch_nmi_watchdog();
}
- preempt_enable();
pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, log_lvl);
+
+ put_task_stack(task);
}
void show_regs(struct pt_regs *regs)
{
int i;
- unsigned long sp;
- sp = regs->sp;
show_regs_print_info(KERN_DEFAULT);
__show_regs(regs, 1);
@@ -325,8 +208,7 @@ void show_regs(struct pt_regs *regs)
u8 *ip;
printk(KERN_DEFAULT "Stack:\n");
- show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
- 0, KERN_DEFAULT);
+ show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT);
printk(KERN_DEFAULT "Code: ");
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 621b501f8935..90e8dde3ec26 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -40,8 +40,10 @@
* user can e.g. boot the original kernel with mem=1G while still booting the
* next kernel with full memory.
*/
-struct e820map e820;
-struct e820map e820_saved;
+static struct e820map initial_e820 __initdata;
+static struct e820map initial_e820_saved __initdata;
+struct e820map *e820 __refdata = &initial_e820;
+struct e820map *e820_saved __refdata = &initial_e820_saved;
/* For PCI or other memory-mapped resources */
unsigned long pci_mem_start = 0xaeedbabe;
@@ -58,8 +60,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
if (type && ei->type != type)
continue;
@@ -81,8 +83,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
if (type && ei->type != type)
continue;
@@ -128,7 +130,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
void __init e820_add_region(u64 start, u64 size, int type)
{
- __e820_add_region(&e820, start, size, type);
+ __e820_add_region(e820, start, size, type);
}
static void __init e820_print_type(u32 type)
@@ -164,12 +166,12 @@ void __init e820_print_map(char *who)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
+ for (i = 0; i < e820->nr_map; i++) {
printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
- (unsigned long long) e820.map[i].addr,
+ (unsigned long long) e820->map[i].addr,
(unsigned long long)
- (e820.map[i].addr + e820.map[i].size - 1));
- e820_print_type(e820.map[i].type);
+ (e820->map[i].addr + e820->map[i].size - 1));
+ e820_print_type(e820->map[i].type);
printk(KERN_CONT "\n");
}
}
@@ -388,11 +390,11 @@ static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
while (nr_map) {
u64 start = biosmap->addr;
u64 size = biosmap->size;
- u64 end = start + size;
+ u64 end = start + size - 1;
u32 type = biosmap->type;
/* Overflow in 64 bits? Ignore the memory map. */
- if (start > end)
+ if (start > end && likely(size))
return -1;
e820_add_region(start, size, type);
@@ -493,13 +495,13 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
unsigned new_type)
{
- return __e820_update_range(&e820, start, size, old_type, new_type);
+ return __e820_update_range(e820, start, size, old_type, new_type);
}
static u64 __init e820_update_range_saved(u64 start, u64 size,
unsigned old_type, unsigned new_type)
{
- return __e820_update_range(&e820_saved, start, size, old_type,
+ return __e820_update_range(e820_saved, start, size, old_type,
new_type);
}
@@ -521,8 +523,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
e820_print_type(old_type);
printk(KERN_CONT "\n");
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
u64 final_start, final_end;
u64 ei_end;
@@ -566,15 +568,15 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
void __init update_e820(void)
{
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map))
+ if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map))
return;
printk(KERN_INFO "e820: modified physical RAM map:\n");
e820_print_map("modified");
}
static void __init update_e820_saved(void)
{
- sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map),
- &e820_saved.nr_map);
+ sanitize_e820_map(e820_saved->map, ARRAY_SIZE(e820_saved->map),
+ &e820_saved->nr_map);
}
#define MAX_GAP_END 0x100000000ull
/*
@@ -584,14 +586,14 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
unsigned long start_addr, unsigned long long end_addr)
{
unsigned long long last;
- int i = e820.nr_map;
+ int i = e820->nr_map;
int found = 0;
last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
while (--i >= 0) {
- unsigned long long start = e820.map[i].addr;
- unsigned long long end = start + e820.map[i].size;
+ unsigned long long start = e820->map[i].addr;
+ unsigned long long end = start + e820->map[i].size;
if (end < start_addr)
continue;
@@ -649,6 +651,33 @@ __init void e820_setup_gap(void)
gapstart, gapstart + gapsize - 1);
}
+/*
+ * Called late during init, in free_initmem().
+ *
+ * Initial e820 and e820_saved are largish __initdata arrays.
+ * Copy them to (usually much smaller) dynamically allocated area.
+ * This is done after all tweaks we ever do to them:
+ * all functions which modify them are __init functions,
+ * they won't exist after this point.
+ */
+__init void e820_reallocate_tables(void)
+{
+ struct e820map *n;
+ int size;
+
+ size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map;
+ n = kmalloc(size, GFP_KERNEL);
+ BUG_ON(!n);
+ memcpy(n, e820, size);
+ e820 = n;
+
+ size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map;
+ n = kmalloc(size, GFP_KERNEL);
+ BUG_ON(!n);
+ memcpy(n, e820_saved, size);
+ e820_saved = n;
+}
+
/**
* Because of the size limitation of struct boot_params, only first
* 128 E820 memory entries are passed to kernel via
@@ -665,7 +694,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len)
entries = sdata->len / sizeof(struct e820entry);
extmap = (struct e820entry *)(sdata->data);
__append_e820_map(extmap, entries);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
early_memunmap(sdata, data_len);
printk(KERN_INFO "e820: extended physical RAM map:\n");
e820_print_map("extended");
@@ -686,8 +715,8 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
int i;
unsigned long pfn = 0;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
if (pfn < PFN_UP(ei->addr))
register_nosave_region(pfn, PFN_UP(ei->addr));
@@ -712,8 +741,8 @@ static int __init e820_mark_nvs_memory(void)
{
int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
if (ei->type == E820_NVS)
acpi_nvs_register(ei->addr, ei->size);
@@ -754,22 +783,18 @@ u64 __init early_reserve_e820(u64 size, u64 align)
/*
* Find the highest page frame number we have available
*/
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
int i;
unsigned long last_pfn = 0;
unsigned long max_arch_pfn = MAX_ARCH_PFN;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
unsigned long start_pfn;
unsigned long end_pfn;
- /*
- * Persistent memory is accounted as ram for purposes of
- * establishing max_pfn and mem_map.
- */
- if (ei->type != E820_RAM && ei->type != E820_PRAM)
+ if (ei->type != type)
continue;
start_pfn = ei->addr >> PAGE_SHIFT;
@@ -794,15 +819,15 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
}
unsigned long __init e820_end_of_ram_pfn(void)
{
- return e820_end_pfn(MAX_ARCH_PFN);
+ return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
}
unsigned long __init e820_end_of_low_ram_pfn(void)
{
- return e820_end_pfn(1UL << (32-PAGE_SHIFT));
+ return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM);
}
-static void early_panic(char *msg)
+static void __init early_panic(char *msg)
{
early_printk(msg);
panic(msg);
@@ -856,7 +881,7 @@ static int __init parse_memmap_one(char *p)
*/
saved_max_pfn = e820_end_of_ram_pfn();
#endif
- e820.nr_map = 0;
+ e820->nr_map = 0;
userdef = 1;
return 0;
}
@@ -903,8 +928,8 @@ early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
{
if (userdef) {
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map),
- &e820.nr_map) < 0)
+ if (sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map),
+ &e820->nr_map) < 0)
early_panic("Invalid user supplied memory map");
printk(KERN_INFO "e820: user-defined physical RAM map:\n");
@@ -912,7 +937,7 @@ void __init finish_e820_parsing(void)
}
}
-static const char *e820_type_to_string(int e820_type)
+static const char *__init e820_type_to_string(int e820_type)
{
switch (e820_type) {
case E820_RESERVED_KERN:
@@ -926,7 +951,7 @@ static const char *e820_type_to_string(int e820_type)
}
}
-static unsigned long e820_type_to_iomem_type(int e820_type)
+static unsigned long __init e820_type_to_iomem_type(int e820_type)
{
switch (e820_type) {
case E820_RESERVED_KERN:
@@ -942,7 +967,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type)
}
}
-static unsigned long e820_type_to_iores_desc(int e820_type)
+static unsigned long __init e820_type_to_iores_desc(int e820_type)
{
switch (e820_type) {
case E820_ACPI:
@@ -961,7 +986,7 @@ static unsigned long e820_type_to_iores_desc(int e820_type)
}
}
-static bool do_mark_busy(u32 type, struct resource *res)
+static bool __init do_mark_busy(u32 type, struct resource *res)
{
/* this is the legacy bios/dos rom-shadow + mmio region */
if (res->start < (1ULL<<20))
@@ -991,35 +1016,35 @@ void __init e820_reserve_resources(void)
struct resource *res;
u64 end;
- res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
+ res = alloc_bootmem(sizeof(struct resource) * e820->nr_map);
e820_res = res;
- for (i = 0; i < e820.nr_map; i++) {
- end = e820.map[i].addr + e820.map[i].size - 1;
+ for (i = 0; i < e820->nr_map; i++) {
+ end = e820->map[i].addr + e820->map[i].size - 1;
if (end != (resource_size_t)end) {
res++;
continue;
}
- res->name = e820_type_to_string(e820.map[i].type);
- res->start = e820.map[i].addr;
+ res->name = e820_type_to_string(e820->map[i].type);
+ res->start = e820->map[i].addr;
res->end = end;
- res->flags = e820_type_to_iomem_type(e820.map[i].type);
- res->desc = e820_type_to_iores_desc(e820.map[i].type);
+ res->flags = e820_type_to_iomem_type(e820->map[i].type);
+ res->desc = e820_type_to_iores_desc(e820->map[i].type);
/*
* don't register the region that could be conflicted with
* pci device BAR resource and insert them later in
* pcibios_resource_survey()
*/
- if (do_mark_busy(e820.map[i].type, res)) {
+ if (do_mark_busy(e820->map[i].type, res)) {
res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
res++;
}
- for (i = 0; i < e820_saved.nr_map; i++) {
- struct e820entry *entry = &e820_saved.map[i];
+ for (i = 0; i < e820_saved->nr_map; i++) {
+ struct e820entry *entry = &e820_saved->map[i];
firmware_map_add_early(entry->addr,
entry->addr + entry->size,
e820_type_to_string(entry->type));
@@ -1027,7 +1052,7 @@ void __init e820_reserve_resources(void)
}
/* How much should we pad RAM ending depending on where it is? */
-static unsigned long ram_alignment(resource_size_t pos)
+static unsigned long __init ram_alignment(resource_size_t pos)
{
unsigned long mb = pos >> 20;
@@ -1051,7 +1076,7 @@ void __init e820_reserve_resources_late(void)
struct resource *res;
res = e820_res;
- for (i = 0; i < e820.nr_map; i++) {
+ for (i = 0; i < e820->nr_map; i++) {
if (!res->parent && res->end)
insert_resource_expand_to_fit(&iomem_resource, res);
res++;
@@ -1061,8 +1086,8 @@ void __init e820_reserve_resources_late(void)
* Try to bump up RAM regions to reasonable boundaries to
* avoid stolen RAM:
*/
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *entry = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *entry = &e820->map[i];
u64 start, end;
if (entry->type != E820_RAM)
@@ -1110,7 +1135,7 @@ char *__init default_machine_specific_memory_setup(void)
who = "BIOS-e801";
}
- e820.nr_map = 0;
+ e820->nr_map = 0;
e820_add_region(0, LOWMEMSIZE(), E820_RAM);
e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
}
@@ -1124,7 +1149,7 @@ void __init setup_memory_map(void)
char *who;
who = x86_init.resources.memory_setup();
- memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ memcpy(e820_saved, e820, sizeof(struct e820map));
printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
e820_print_map(who);
}
@@ -1141,8 +1166,8 @@ void __init memblock_x86_fill(void)
*/
memblock_allow_resize();
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ struct e820entry *ei = &e820->map[i];
end = ei->addr + ei->size;
if (end != (resource_size_t)end)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index f306698a4cb4..6a08e25a48d8 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -547,7 +547,7 @@ intel_graphics_stolen(int num, int slot, int func,
/* Mark this space as reserved */
e820_add_region(base, size, E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
}
static void __init intel_graphics_quirks(int num, int slot, int func)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 3fc03a09a93b..47004010ad5d 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -12,6 +12,7 @@
#include <asm/traps.h>
#include <linux/hardirq.h>
+#include <linux/pkeys.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/fpu.h>
@@ -505,6 +506,9 @@ static inline void copy_init_fpstate_to_fpregs(void)
copy_kernel_to_fxregs(&init_fpstate.fxsave);
else
copy_kernel_to_fregs(&init_fpstate.fsave);
+
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ copy_init_pkru_to_fpregs();
}
/*
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 93982aebb398..2f2b8c7ccb85 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -317,7 +317,6 @@ static void __init fpu__init_system_ctx_switch(void)
on_boot_cpu = 0;
WARN_ON_FPU(current->thread.fpu.fpstate_active);
- current_thread_info()->status = 0;
if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
eagerfpu = ENABLE;
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 01567aa87503..095ef7ddd6ae 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -5,6 +5,7 @@
*/
#include <linux/compat.h>
#include <linux/cpu.h>
+#include <linux/mman.h>
#include <linux/pkeys.h>
#include <asm/fpu/api.h>
@@ -73,6 +74,8 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
setup_clear_cpu_cap(X86_FEATURE_PKU);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
}
/*
@@ -866,9 +869,10 @@ const void *get_xsave_field_ptr(int xsave_state)
return get_xsave_addr(&fpu->state.xsave, xsave_state);
}
+#ifdef CONFIG_ARCH_HAS_PKEYS
+
#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
-
/*
* This will go out and modify PKRU register to set the access
* rights for @pkey to @init_val.
@@ -914,6 +918,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
return 0;
}
+#endif /* ! CONFIG_ARCH_HAS_PKEYS */
/*
* This is similar to user_regset_copyout(), but will not add offset to
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d036cfb4495d..8639bb2ae058 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -1029,7 +1029,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
}
if (ftrace_push_return_trace(old, self_addr, &trace.depth,
- frame_pointer) == -EBUSY) {
+ frame_pointer, parent) == -EBUSY) {
*parent = old;
return;
}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 6f8902b0d151..b6b2f0264af3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -23,6 +23,7 @@
#include <asm/percpu.h>
#include <asm/nops.h>
#include <asm/bootparam.h>
+#include <asm/export.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -94,7 +95,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
*/
__HEAD
ENTRY(startup_32)
- movl pa(stack_start),%ecx
+ movl pa(initial_stack),%ecx
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
us to not reload segments */
@@ -286,7 +287,7 @@ num_subarch_entries = (. - subarch_entries) / 4
* start_secondary().
*/
ENTRY(start_cpu0)
- movl stack_start, %ecx
+ movl initial_stack, %ecx
movl %ecx, %esp
jmp *(initial_code)
ENDPROC(start_cpu0)
@@ -307,7 +308,7 @@ ENTRY(startup_32_smp)
movl %eax,%es
movl %eax,%fs
movl %eax,%gs
- movl pa(stack_start),%ecx
+ movl pa(initial_stack),%ecx
movl %eax,%ss
leal -__PAGE_OFFSET(%ecx),%esp
@@ -673,6 +674,7 @@ ENTRY(empty_zero_page)
.fill 4096,1,0
ENTRY(swapper_pg_dir)
.fill 1024,4,0
+EXPORT_SYMBOL(empty_zero_page)
/*
* This starts the data section.
@@ -703,7 +705,7 @@ ENTRY(initial_page_table)
.data
.balign 4
-ENTRY(stack_start)
+ENTRY(initial_stack)
.long init_thread_union+THREAD_SIZE
__INITRODATA
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 9f8efc9f0075..b4421cc191b0 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -21,6 +21,7 @@
#include <asm/percpu.h>
#include <asm/nops.h>
#include "../entry/calling.h"
+#include <asm/export.h>
#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
@@ -66,7 +67,7 @@ startup_64:
*/
/*
- * Setup stack for verify_cpu(). "-8" because stack_start is defined
+ * Setup stack for verify_cpu(). "-8" because initial_stack is defined
* this way, see below. Our best guess is a NULL ptr for stack
* termination heuristics and we don't want to break anything which
* might depend on it (kgdb, ...).
@@ -226,7 +227,7 @@ ENTRY(secondary_startup_64)
movq %rax, %cr0
/* Setup a boot time stack */
- movq stack_start(%rip), %rsp
+ movq initial_stack(%rip), %rsp
/* zero EFLAGS after setting rsp */
pushq $0
@@ -310,7 +311,7 @@ ENDPROC(secondary_startup_64)
* start_secondary().
*/
ENTRY(start_cpu0)
- movq stack_start(%rip),%rsp
+ movq initial_stack(%rip),%rsp
movq initial_code(%rip),%rax
pushq $0 # fake return address to stop unwinder
pushq $__KERNEL_CS # set correct cs
@@ -319,17 +320,15 @@ ENTRY(start_cpu0)
ENDPROC(start_cpu0)
#endif
- /* SMP bootup changes these two */
+ /* Both SMP bootup and ACPI suspend change these variables */
__REFDATA
.balign 8
GLOBAL(initial_code)
.quad x86_64_start_kernel
GLOBAL(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)
-
- GLOBAL(stack_start)
+ GLOBAL(initial_stack)
.quad init_thread_union+THREAD_SIZE-8
- .word 0
__FINITDATA
bad_address:
@@ -488,10 +487,12 @@ early_gdt_descr_base:
ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
+EXPORT_SYMBOL(phys_base)
#include "../../x86/xen/xen-head.S"
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
+EXPORT_SYMBOL(empty_zero_page)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index c6dfd801df97..274fab99169d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -756,10 +756,104 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd)
/*
* Clock source related code
*/
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+/*
+ * Reading the HPET counter is a very slow operation. If a large number of
+ * CPUs are trying to access the HPET counter simultaneously, it can cause
+ * massive delay and slow down system performance dramatically. This may
+ * happen when HPET is the default clock source instead of TSC. For a
+ * really large system with hundreds of CPUs, the slowdown may be so
+ * severe that it may actually crash the system because of a NMI watchdog
+ * soft lockup, for example.
+ *
+ * If multiple CPUs are trying to access the HPET counter at the same time,
+ * we don't actually need to read the counter multiple times. Instead, the
+ * other CPUs can use the counter value read by the first CPU in the group.
+ *
+ * This special feature is only enabled on x86-64 systems. It is unlikely
+ * that 32-bit x86 systems will have enough CPUs to require this feature
+ * with its associated locking overhead. And we also need 64-bit atomic
+ * read.
+ *
+ * The lock and the hpet value are stored together and can be read in a
+ * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t
+ * is 32 bits in size.
+ */
+union hpet_lock {
+ struct {
+ arch_spinlock_t lock;
+ u32 value;
+ };
+ u64 lockval;
+};
+
+static union hpet_lock hpet __cacheline_aligned = {
+ { .lock = __ARCH_SPIN_LOCK_UNLOCKED, },
+};
+
+static cycle_t read_hpet(struct clocksource *cs)
+{
+ unsigned long flags;
+ union hpet_lock old, new;
+
+ BUILD_BUG_ON(sizeof(union hpet_lock) != 8);
+
+ /*
+ * Read HPET directly if in NMI.
+ */
+ if (in_nmi())
+ return (cycle_t)hpet_readl(HPET_COUNTER);
+
+ /*
+ * Read the current state of the lock and HPET value atomically.
+ */
+ old.lockval = READ_ONCE(hpet.lockval);
+
+ if (arch_spin_is_locked(&old.lock))
+ goto contended;
+
+ local_irq_save(flags);
+ if (arch_spin_trylock(&hpet.lock)) {
+ new.value = hpet_readl(HPET_COUNTER);
+ /*
+ * Use WRITE_ONCE() to prevent store tearing.
+ */
+ WRITE_ONCE(hpet.value, new.value);
+ arch_spin_unlock(&hpet.lock);
+ local_irq_restore(flags);
+ return (cycle_t)new.value;
+ }
+ local_irq_restore(flags);
+
+contended:
+ /*
+ * Contended case
+ * --------------
+ * Wait until the HPET value change or the lock is free to indicate
+ * its value is up-to-date.
+ *
+ * It is possible that old.value has already contained the latest
+ * HPET value while the lock holder was in the process of releasing
+ * the lock. Checking for lock state change will enable us to return
+ * the value immediately instead of waiting for the next HPET reader
+ * to come along.
+ */
+ do {
+ cpu_relax();
+ new.lockval = READ_ONCE(hpet.lockval);
+ } while ((new.value == old.value) && arch_spin_is_locked(&new.lock));
+
+ return (cycle_t)new.value;
+}
+#else
+/*
+ * For UP or 32-bit.
+ */
static cycle_t read_hpet(struct clocksource *cs)
{
return (cycle_t)hpet_readl(HPET_COUNTER);
}
+#endif
static struct clocksource clocksource_hpet = {
.name = "hpet",
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
deleted file mode 100644
index 1f9b878ef5ef..000000000000
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <linux/export.h>
-#include <linux/spinlock_types.h>
-
-#include <asm/checksum.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/ftrace.h>
-
-#ifdef CONFIG_FUNCTION_TRACER
-/* mcount is defined in assembly */
-EXPORT_SYMBOL(mcount);
-#endif
-
-/*
- * Note, this is a prototype to get at the symbol for
- * the export, but dont use it from C code, it is used
- * by assembly code and is not using C calling convention!
- */
-#ifndef CONFIG_X86_CMPXCHG64
-extern void cmpxchg8b_emu(void);
-EXPORT_SYMBOL(cmpxchg8b_emu);
-#endif
-
-/* Networking helper routines. */
-EXPORT_SYMBOL(csum_partial_copy_generic);
-
-EXPORT_SYMBOL(__get_user_1);
-EXPORT_SYMBOL(__get_user_2);
-EXPORT_SYMBOL(__get_user_4);
-EXPORT_SYMBOL(__get_user_8);
-
-EXPORT_SYMBOL(__put_user_1);
-EXPORT_SYMBOL(__put_user_2);
-EXPORT_SYMBOL(__put_user_4);
-EXPORT_SYMBOL(__put_user_8);
-
-EXPORT_SYMBOL(strstr);
-
-EXPORT_SYMBOL(csum_partial);
-EXPORT_SYMBOL(empty_zero_page);
-
-#ifdef CONFIG_PREEMPT
-EXPORT_SYMBOL(___preempt_schedule);
-EXPORT_SYMBOL(___preempt_schedule_notrace);
-#endif
-
-EXPORT_SYMBOL(__sw_hweight32);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 4a7903714065..9ebd0b0e73d9 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -40,8 +40,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (user_mode(regs))
return;
- if (regs->sp >= curbase + sizeof(struct thread_info) +
- sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
+ if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
regs->sp <= curbase + THREAD_SIZE)
return;
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index f2356bda2b05..3407b148c240 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -99,14 +99,14 @@ static int setup_e820_entries(struct boot_params *params)
{
unsigned int nr_e820_entries;
- nr_e820_entries = e820_saved.nr_map;
+ nr_e820_entries = e820_saved->nr_map;
/* TODO: Pass entries more than E820MAX in bootparams setup data */
if (nr_e820_entries > E820MAX)
nr_e820_entries = E820MAX;
params->e820_entries = nr_e820_entries;
- memcpy(&params->e820_map, &e820_saved.map,
+ memcpy(&params->e820_map, &e820_saved->map,
nr_e820_entries * sizeof(struct e820entry));
return 0;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 04cde527d728..8e36f249646e 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -50,6 +50,7 @@
#include <asm/apicdef.h>
#include <asm/apic.h>
#include <asm/nmi.h>
+#include <asm/switch_to.h>
struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{
@@ -166,21 +167,19 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_DX] = 0;
gdb_regs[GDB_SI] = 0;
gdb_regs[GDB_DI] = 0;
- gdb_regs[GDB_BP] = *(unsigned long *)p->thread.sp;
+ gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp;
#ifdef CONFIG_X86_32
gdb_regs[GDB_DS] = __KERNEL_DS;
gdb_regs[GDB_ES] = __KERNEL_DS;
gdb_regs[GDB_PS] = 0;
gdb_regs[GDB_CS] = __KERNEL_CS;
- gdb_regs[GDB_PC] = p->thread.ip;
gdb_regs[GDB_SS] = __KERNEL_DS;
gdb_regs[GDB_FS] = 0xFFFF;
gdb_regs[GDB_GS] = 0xFFFF;
#else
- gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
+ gdb_regs32[GDB_PS] = 0;
gdb_regs32[GDB_CS] = __KERNEL_CS;
gdb_regs32[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_R8] = 0;
gdb_regs[GDB_R9] = 0;
gdb_regs[GDB_R10] = 0;
@@ -190,6 +189,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_R14] = 0;
gdb_regs[GDB_R15] = 0;
#endif
+ gdb_regs[GDB_PC] = 0;
gdb_regs[GDB_SP] = p->thread.sp;
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 7847e5c0e0b5..d9d8d16b69db 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -45,11 +45,12 @@
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
-#include <linux/module.h>
+#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
#include <linux/frame.h>
+#include <linux/kasan.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -1057,9 +1058,10 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
* tailcall optimization. So, to be absolutely safe
* we also save and restore enough stack bytes to cover
* the argument area.
+ * Use __memcpy() to avoid KASAN stack out-of-bounds reports as we copy
+ * raw stack chunk with redzones:
*/
- memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr,
- MIN_STACK_SIZE(addr));
+ __memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, MIN_STACK_SIZE(addr));
regs->flags &= ~X86_EFLAGS_IF;
trace_hardirqs_off();
regs->ip = (unsigned long)(jp->entry);
@@ -1080,6 +1082,9 @@ void jprobe_return(void)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ /* Unpoison stack redzones in the frames we are going to jump over. */
+ kasan_unpoison_stack_above_sp_to(kcb->jprobe_saved_sp);
+
asm volatile (
#ifdef CONFIG_X86_64
" xchg %%rbx,%%rsp \n"
@@ -1118,7 +1123,7 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
/* It's OK to start function graph tracing again */
unpause_graph_tracing();
*regs = kcb->jprobe_saved_regs;
- memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp));
+ __memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp));
preempt_enable_no_resched();
return 1;
}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 4425f593f0ec..3bb4c5f021f6 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -24,7 +24,7 @@
#include <linux/slab.h>
#include <linux/hardirq.h>
#include <linux/preempt.h>
-#include <linux/module.h>
+#include <linux/extable.h>
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
index c2bedaea11f7..4afc67f5facc 100644
--- a/arch/x86/kernel/ksysfs.c
+++ b/arch/x86/kernel/ksysfs.c
@@ -184,7 +184,7 @@ out:
static struct kobj_attribute type_attr = __ATTR_RO(type);
-static struct bin_attribute data_attr = {
+static struct bin_attribute data_attr __ro_after_init = {
.attr = {
.name = "data",
.mode = S_IRUGO,
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 1726c4c12336..edbbfc854e39 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -423,12 +423,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
kvm_spinlock_init();
}
-static void kvm_guest_cpu_online(void *dummy)
-{
- kvm_guest_cpu_init();
-}
-
-static void kvm_guest_cpu_offline(void *dummy)
+static void kvm_guest_cpu_offline(void)
{
kvm_disable_steal_time();
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -437,29 +432,21 @@ static void kvm_guest_cpu_offline(void *dummy)
apf_task_wake_all();
}
-static int kvm_cpu_notify(struct notifier_block *self, unsigned long action,
- void *hcpu)
+static int kvm_cpu_online(unsigned int cpu)
{
- int cpu = (unsigned long)hcpu;
- switch (action) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- case CPU_ONLINE_FROZEN:
- smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
- break;
- case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
+ local_irq_disable();
+ kvm_guest_cpu_init();
+ local_irq_enable();
+ return 0;
}
-static struct notifier_block kvm_cpu_notifier = {
- .notifier_call = kvm_cpu_notify,
-};
+static int kvm_cpu_down_prepare(unsigned int cpu)
+{
+ local_irq_disable();
+ kvm_guest_cpu_offline();
+ local_irq_enable();
+ return 0;
+}
#endif
static void __init kvm_apf_trap_init(void)
@@ -494,7 +481,9 @@ void __init kvm_guest_init(void)
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
- register_cpu_notifier(&kvm_cpu_notifier);
+ if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
+ kvm_cpu_online, kvm_cpu_down_prepare) < 0)
+ pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
#else
kvm_guest_cpu_init();
#endif
@@ -575,9 +564,6 @@ static void kvm_kick_cpu(int cpu)
kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}
-
-#ifdef CONFIG_QUEUED_SPINLOCKS
-
#include <asm/qspinlock.h>
static void kvm_wait(u8 *ptr, u8 val)
@@ -606,243 +592,6 @@ out:
local_irq_restore(flags);
}
-#else /* !CONFIG_QUEUED_SPINLOCKS */
-
-enum kvm_contention_stat {
- TAKEN_SLOW,
- TAKEN_SLOW_PICKUP,
- RELEASED_SLOW,
- RELEASED_SLOW_KICKED,
- NR_CONTENTION_STATS
-};
-
-#ifdef CONFIG_KVM_DEBUG_FS
-#define HISTO_BUCKETS 30
-
-static struct kvm_spinlock_stats
-{
- u32 contention_stats[NR_CONTENTION_STATS];
- u32 histo_spin_blocked[HISTO_BUCKETS+1];
- u64 time_blocked;
-} spinlock_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
- u8 ret;
- u8 old;
-
- old = READ_ONCE(zero_stats);
- if (unlikely(old)) {
- ret = cmpxchg(&zero_stats, old, 0);
- /* This ensures only one fellow resets the stat */
- if (ret == old)
- memset(&spinlock_stats, 0, sizeof(spinlock_stats));
- }
-}
-
-static inline void add_stats(enum kvm_contention_stat var, u32 val)
-{
- check_zero();
- spinlock_stats.contention_stats[var] += val;
-}
-
-
-static inline u64 spin_time_start(void)
-{
- return sched_clock();
-}
-
-static void __spin_time_accum(u64 delta, u32 *array)
-{
- unsigned index;
-
- index = ilog2(delta);
- check_zero();
-
- if (index < HISTO_BUCKETS)
- array[index]++;
- else
- array[HISTO_BUCKETS]++;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
- u32 delta;
-
- delta = sched_clock() - start;
- __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
- spinlock_stats.time_blocked += delta;
-}
-
-static struct dentry *d_spin_debug;
-static struct dentry *d_kvm_debug;
-
-static struct dentry *kvm_init_debugfs(void)
-{
- d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
- if (!d_kvm_debug)
- printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
-
- return d_kvm_debug;
-}
-
-static int __init kvm_spinlock_debugfs(void)
-{
- struct dentry *d_kvm;
-
- d_kvm = kvm_init_debugfs();
- if (d_kvm == NULL)
- return -ENOMEM;
-
- d_spin_debug = debugfs_create_dir("spinlocks", d_kvm);
-
- debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
-
- debugfs_create_u32("taken_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW]);
- debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
-
- debugfs_create_u32("released_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW]);
- debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
-
- debugfs_create_u64("time_blocked", 0444, d_spin_debug,
- &spinlock_stats.time_blocked);
-
- debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
- spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
-
- return 0;
-}
-fs_initcall(kvm_spinlock_debugfs);
-#else /* !CONFIG_KVM_DEBUG_FS */
-static inline void add_stats(enum kvm_contention_stat var, u32 val)
-{
-}
-
-static inline u64 spin_time_start(void)
-{
- return 0;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
-}
-#endif /* CONFIG_KVM_DEBUG_FS */
-
-struct kvm_lock_waiting {
- struct arch_spinlock *lock;
- __ticket_t want;
-};
-
-/* cpus 'waiting' on a spinlock to become available */
-static cpumask_t waiting_cpus;
-
-/* Track spinlock on which a cpu is waiting */
-static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);
-
-__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
-{
- struct kvm_lock_waiting *w;
- int cpu;
- u64 start;
- unsigned long flags;
- __ticket_t head;
-
- if (in_nmi())
- return;
-
- w = this_cpu_ptr(&klock_waiting);
- cpu = smp_processor_id();
- start = spin_time_start();
-
- /*
- * Make sure an interrupt handler can't upset things in a
- * partially setup state.
- */
- local_irq_save(flags);
-
- /*
- * The ordering protocol on this is that the "lock" pointer
- * may only be set non-NULL if the "want" ticket is correct.
- * If we're updating "want", we must first clear "lock".
- */
- w->lock = NULL;
- smp_wmb();
- w->want = want;
- smp_wmb();
- w->lock = lock;
-
- add_stats(TAKEN_SLOW, 1);
-
- /*
- * This uses set_bit, which is atomic but we should not rely on its
- * reordering gurantees. So barrier is needed after this call.
- */
- cpumask_set_cpu(cpu, &waiting_cpus);
-
- barrier();
-
- /*
- * Mark entry to slowpath before doing the pickup test to make
- * sure we don't deadlock with an unlocker.
- */
- __ticket_enter_slowpath(lock);
-
- /* make sure enter_slowpath, which is atomic does not cross the read */
- smp_mb__after_atomic();
-
- /*
- * check again make sure it didn't become free while
- * we weren't looking.
- */
- head = READ_ONCE(lock->tickets.head);
- if (__tickets_equal(head, want)) {
- add_stats(TAKEN_SLOW_PICKUP, 1);
- goto out;
- }
-
- /*
- * halt until it's our turn and kicked. Note that we do safe halt
- * for irq enabled case to avoid hang when lock info is overwritten
- * in irq spinlock slowpath and no spurious interrupt occur to save us.
- */
- if (arch_irqs_disabled_flags(flags))
- halt();
- else
- safe_halt();
-
-out:
- cpumask_clear_cpu(cpu, &waiting_cpus);
- w->lock = NULL;
- local_irq_restore(flags);
- spin_time_accum_blocked(start);
-}
-PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning);
-
-/* Kick vcpu waiting on @lock->head to reach value @ticket */
-static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
-{
- int cpu;
-
- add_stats(RELEASED_SLOW, 1);
- for_each_cpu(cpu, &waiting_cpus) {
- const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
- if (READ_ONCE(w->lock) == lock &&
- READ_ONCE(w->want) == ticket) {
- add_stats(RELEASED_SLOW_KICKED, 1);
- kvm_kick_cpu(cpu);
- break;
- }
- }
-}
-
-#endif /* !CONFIG_QUEUED_SPINLOCKS */
-
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -854,16 +603,11 @@ void __init kvm_spinlock_init(void)
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return;
-#ifdef CONFIG_QUEUED_SPINLOCKS
__pv_init_lock_hash();
pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = kvm_wait;
pv_lock_ops.kick = kvm_kick_cpu;
-#else /* !CONFIG_QUEUED_SPINLOCKS */
- pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
- pv_lock_ops.unlock_kick = kvm_unlock_kick;
-#endif
}
static __init int kvm_spinlock_init_jump(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 3692249a70f1..60b9949f1e65 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,7 +29,7 @@
#include <asm/x86_init.h>
#include <asm/reboot.h>
-static int kvmclock = 1;
+static int kvmclock __ro_after_init = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
static cycle_t kvm_sched_clock_offset;
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
new file mode 100644
index 000000000000..e9d252d873aa
--- /dev/null
+++ b/arch/x86/kernel/livepatch.c
@@ -0,0 +1,65 @@
+/*
+ * livepatch.c - x86-specific Kernel Live Patching Core
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/livepatch.h>
+#include <asm/text-patching.h>
+
+/* Apply per-object alternatives. Based on x86 module_finalize() */
+void arch_klp_init_object_loaded(struct klp_patch *patch,
+ struct klp_object *obj)
+{
+ int cnt;
+ struct klp_modinfo *info;
+ Elf_Shdr *s, *alt = NULL, *para = NULL;
+ void *aseg, *pseg;
+ const char *objname;
+ char sec_objname[MODULE_NAME_LEN];
+ char secname[KSYM_NAME_LEN];
+
+ info = patch->mod->klp_info;
+ objname = obj->name ? obj->name : "vmlinux";
+
+ /* See livepatch core code for BUILD_BUG_ON() explanation */
+ BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+
+ for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) {
+ /* Apply per-object .klp.arch sections */
+ cnt = sscanf(info->secstrings + s->sh_name,
+ ".klp.arch.%55[^.].%127s",
+ sec_objname, secname);
+ if (cnt != 2)
+ continue;
+ if (strcmp(sec_objname, objname))
+ continue;
+ if (!strcmp(".altinstructions", secname))
+ alt = s;
+ if (!strcmp(".parainstructions", secname))
+ para = s;
+ }
+
+ if (alt) {
+ aseg = (void *) alt->sh_addr;
+ apply_alternatives(aseg, aseg + alt->sh_size);
+ }
+
+ if (para) {
+ pseg = (void *) para->sh_addr;
+ apply_paravirt(pseg, pseg + para->sh_size);
+ }
+}
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 5a294e48b185..8c1f218926d7 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -337,6 +337,9 @@ void arch_crash_save_vmcoreinfo(void)
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
+ VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET);
+ VMCOREINFO_VMALLOC_START(VMALLOC_START);
+ VMCOREINFO_VMEMMAP_START(VMEMMAP_START);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
index 61924222a9e1..efe73aacf966 100644
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/mcount_64.S
@@ -7,6 +7,7 @@
#include <linux/linkage.h>
#include <asm/ptrace.h>
#include <asm/ftrace.h>
+#include <asm/export.h>
.code64
@@ -294,6 +295,7 @@ trace:
jmp fgraph_trace
END(function_hook)
#endif /* CONFIG_DYNAMIC_FTRACE */
+EXPORT_SYMBOL(function_hook)
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 068c4a929de6..0f8d20497383 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -499,6 +499,9 @@ void __init default_get_smp_config(unsigned int early)
{
struct mpf_intel *mpf = mpf_found;
+ if (!smp_found_config)
+ return;
+
if (!mpf)
return;
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 1939a0269377..2c55a003b793 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,7 +8,6 @@
#include <asm/paravirt.h>
-#ifdef CONFIG_QUEUED_SPINLOCKS
__visible void __native_queued_spin_unlock(struct qspinlock *lock)
{
native_queued_spin_unlock(lock);
@@ -21,19 +20,13 @@ bool pv_is_native_spin_unlock(void)
return pv_lock_ops.queued_spin_unlock.func ==
__raw_callee_save___native_queued_spin_unlock;
}
-#endif
struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP
-#ifdef CONFIG_QUEUED_SPINLOCKS
.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
.wait = paravirt_nop,
.kick = paravirt_nop,
-#else /* !CONFIG_QUEUED_SPINLOCKS */
- .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
- .unlock_kick = paravirt_nop,
-#endif /* !CONFIG_QUEUED_SPINLOCKS */
#endif /* SMP */
};
EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1acfd76e3e26..bbf3d5933eaa 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -332,7 +332,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
.read_cr4 = native_read_cr4,
- .read_cr4_safe = native_read_cr4_safe,
.write_cr4 = native_write_cr4,
#ifdef CONFIG_X86_64
.read_cr8 = native_read_cr8,
@@ -389,7 +388,7 @@ NOKPROBE_SYMBOL(native_load_idt);
#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
#endif
-struct pv_mmu_ops pv_mmu_ops = {
+struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.read_cr2 = native_read_cr2,
.write_cr2 = native_write_cr2,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 158dc0650d5d..920c6ae08592 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -10,7 +10,7 @@ DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
#endif
@@ -49,7 +49,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_cpu_ops, clts);
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
start = start_pv_lock_ops_queued_spin_unlock;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index e70087a04cc8..bb3840cedb4f 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,7 +19,7 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
DEF_NATIVE(, mov32, "mov %edi, %eax");
DEF_NATIVE(, mov64, "mov %rdi, %rax");
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
#endif
@@ -61,7 +61,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
-#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
start = start_pv_lock_ops_queued_spin_unlock;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 62c0b0ea2ce4..0888a879120f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -32,6 +32,7 @@
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
+#include <asm/switch_to.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -301,7 +302,7 @@ void arch_cpu_idle(void)
/*
* We use this if we don't have any better idle routine..
*/
-void default_idle(void)
+void __cpuidle default_idle(void)
{
trace_cpu_idle_rcuidle(1, smp_processor_id());
safe_halt();
@@ -416,7 +417,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
* with interrupts enabled and no flags, which is backwards compatible with the
* original MWAIT implementation.
*/
-static void mwait_idle(void)
+static __cpuidle void mwait_idle(void)
{
if (!current_set_polling_and_test()) {
trace_cpu_idle_rcuidle(1, smp_processor_id());
@@ -508,8 +509,18 @@ unsigned long arch_align_stack(unsigned long sp)
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
- unsigned long range_end = mm->brk + 0x02000000;
- return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+ return randomize_page(mm->brk, 0x02000000);
+}
+
+/*
+ * Return saved PC of a blocked thread.
+ * What is this good for? it will be always the scheduler or ret_from_fork.
+ */
+unsigned long thread_saved_pc(struct task_struct *tsk)
+{
+ struct inactive_task_frame *frame =
+ (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
+ return READ_ONCE_NOCHECK(frame->ret_addr);
}
/*
@@ -520,15 +531,18 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
*/
unsigned long get_wchan(struct task_struct *p)
{
- unsigned long start, bottom, top, sp, fp, ip;
+ unsigned long start, bottom, top, sp, fp, ip, ret = 0;
int count = 0;
if (!p || p == current || p->state == TASK_RUNNING)
return 0;
+ if (!try_get_task_stack(p))
+ return 0;
+
start = (unsigned long)task_stack_page(p);
if (!start)
- return 0;
+ goto out;
/*
* Layout of the stack page:
@@ -537,9 +551,7 @@ unsigned long get_wchan(struct task_struct *p)
* PADDING
* ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
* stack
- * ----------- bottom = start + sizeof(thread_info)
- * thread_info
- * ----------- start
+ * ----------- bottom = start
*
* The tasks stack pointer points at the location where the
* framepointer is stored. The data on the stack is:
@@ -550,20 +562,25 @@ unsigned long get_wchan(struct task_struct *p)
*/
top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
top -= 2 * sizeof(unsigned long);
- bottom = start + sizeof(struct thread_info);
+ bottom = start;
sp = READ_ONCE(p->thread.sp);
if (sp < bottom || sp > top)
- return 0;
+ goto out;
- fp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
+ fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
do {
if (fp < bottom || fp > top)
- return 0;
+ goto out;
ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
- if (!in_sched_functions(ip))
- return ip;
+ if (!in_sched_functions(ip)) {
+ ret = ip;
+ goto out;
+ }
fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
} while (count++ < 16 && p->state != TASK_RUNNING);
- return 0;
+
+out:
+ put_task_stack(p);
+ return ret;
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index d86be29c38c7..bd7be8efdc4c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,17 +55,6 @@
#include <asm/switch_to.h>
#include <asm/vm86.h>
-asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
-
-/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- return ((unsigned long *)tsk->thread.sp)[3];
-}
-
void __show_regs(struct pt_regs *regs, int all)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -101,7 +90,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = __read_cr4_safe();
+ cr4 = __read_cr4();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
@@ -133,35 +122,31 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct *p, unsigned long tls)
{
struct pt_regs *childregs = task_pt_regs(p);
+ struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs);
+ struct inactive_task_frame *frame = &fork_frame->frame;
struct task_struct *tsk;
int err;
- p->thread.sp = (unsigned long) childregs;
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
p->thread.sp0 = (unsigned long) (childregs+1);
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
if (unlikely(p->flags & PF_KTHREAD)) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
- p->thread.ip = (unsigned long) ret_from_kernel_thread;
- task_user_gs(p) = __KERNEL_STACK_CANARY;
- childregs->ds = __USER_DS;
- childregs->es = __USER_DS;
- childregs->fs = __KERNEL_PERCPU;
- childregs->bx = sp; /* function */
- childregs->bp = arg;
- childregs->orig_ax = -1;
- childregs->cs = __KERNEL_CS | get_kernel_rpl();
- childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+ frame->bx = sp; /* function */
+ frame->di = arg;
p->thread.io_bitmap_ptr = NULL;
return 0;
}
+ frame->bx = 0;
*childregs = *current_pt_regs();
childregs->ax = 0;
if (sp)
childregs->sp = sp;
- p->thread.ip = (unsigned long) ret_from_fork;
task_user_gs(p) = get_user_gs(current_pt_regs());
p->thread.io_bitmap_ptr = NULL;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 63236d8f84bf..b3760b3c1ca0 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,8 +49,7 @@
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
-
-asmlinkage extern void ret_from_fork(void);
+#include <asm/vdso.h>
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
@@ -110,12 +109,13 @@ void __show_regs(struct pt_regs *regs, int all)
get_debugreg(d7, 7);
/* Only print out debug registers if they are in their non-default state. */
- if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
- (d6 == DR6_RESERVED) && (d7 == 0x400))
- return;
-
- printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
- printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+ (d6 == DR6_RESERVED) && (d7 == 0x400))) {
+ printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
+ d0, d1, d2);
+ printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
+ d3, d6, d7);
+ }
if (boot_cpu_has(X86_FEATURE_OSPKE))
printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
@@ -141,12 +141,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
{
int err;
struct pt_regs *childregs;
+ struct fork_frame *fork_frame;
+ struct inactive_task_frame *frame;
struct task_struct *me = current;
p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs = task_pt_regs(p);
- p->thread.sp = (unsigned long) childregs;
- set_tsk_thread_flag(p, TIF_FORK);
+ fork_frame = container_of(childregs, struct fork_frame, regs);
+ frame = &fork_frame->frame;
+ frame->bp = 0;
+ frame->ret_addr = (unsigned long) ret_from_fork;
+ p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap_ptr = NULL;
savesegment(gs, p->thread.gsindex);
@@ -160,15 +165,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
if (unlikely(p->flags & PF_KTHREAD)) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
- childregs->sp = (unsigned long)childregs;
- childregs->ss = __KERNEL_DS;
- childregs->bx = sp; /* function */
- childregs->bp = arg;
- childregs->orig_ax = -1;
- childregs->cs = __KERNEL_CS | get_kernel_rpl();
- childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+ frame->bx = sp; /* function */
+ frame->r12 = arg;
return 0;
}
+ frame->bx = 0;
*childregs = *current_pt_regs();
childregs->ax = 0;
@@ -511,7 +512,7 @@ void set_personality_ia32(bool x32)
current->personality &= ~READ_IMPLIES_EXEC;
/* in_compat_syscall() uses the presence of the x32
syscall bit flag to determine compat status */
- current_thread_info()->status &= ~TS_COMPAT;
+ current->thread.status &= ~TS_COMPAT;
} else {
set_thread_flag(TIF_IA32);
clear_thread_flag(TIF_X32);
@@ -519,11 +520,24 @@ void set_personality_ia32(bool x32)
current->mm->context.ia32_compat = TIF_IA32;
current->personality |= force_personality32;
/* Prepare the first "return" to user space */
- current_thread_info()->status |= TS_COMPAT;
+ current->thread.status |= TS_COMPAT;
}
}
EXPORT_SYMBOL_GPL(set_personality_ia32);
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
+{
+ int ret;
+
+ ret = map_vdso_once(image, addr);
+ if (ret)
+ return ret;
+
+ return (long)image->size;
+}
+#endif
+
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
int ret = 0;
@@ -577,6 +591,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
break;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# ifdef CONFIG_X86_X32_ABI
+ case ARCH_MAP_VDSO_X32:
+ return prctl_map_vdso(&vdso_image_x32, addr);
+# endif
+# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ case ARCH_MAP_VDSO_32:
+ return prctl_map_vdso(&vdso_image_32, addr);
+# endif
+ case ARCH_MAP_VDSO_64:
+ return prctl_map_vdso(&vdso_image_64, addr);
+#endif
+
default:
ret = -EINVAL;
break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index f79576a541ff..0e63c0267f99 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -173,8 +173,8 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs)
return sp;
prev_esp = (u32 *)(context);
- if (prev_esp)
- return (unsigned long)prev_esp;
+ if (*prev_esp)
+ return (unsigned long)*prev_esp;
return (unsigned long)regs;
}
@@ -934,7 +934,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
*/
regs->orig_ax = value;
if (syscall_get_nr(child, regs) >= 0)
- task_thread_info(child)->status |= TS_I386_REGS_POKED;
+ child->thread.status |= TS_I386_REGS_POKED;
break;
case offsetof(struct user32, regs.eflags):
@@ -1250,7 +1250,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
#ifdef CONFIG_X86_64
-static struct user_regset x86_64_regsets[] __read_mostly = {
+static struct user_regset x86_64_regsets[] __ro_after_init = {
[REGSET_GENERAL] = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1291,7 +1291,7 @@ static const struct user_regset_view user_x86_64_view = {
#endif /* CONFIG_X86_64 */
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-static struct user_regset x86_32_regsets[] __read_mostly = {
+static struct user_regset x86_32_regsets[] __ro_after_init = {
[REGSET_GENERAL] = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1344,7 +1344,7 @@ static const struct user_regset_view user_x86_32_view = {
*/
u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
-void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
+void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
{
#ifdef CONFIG_X86_64
x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
@@ -1358,7 +1358,7 @@ void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
+ if (!user_64bit_mode(task_pt_regs(task)))
#endif
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
return &user_x86_32_view;
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 3599404e3089..5b2cc889ce34 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
do {
version = pvclock_read_begin(src);
- ret = __pvclock_read_cycles(src);
+ ret = __pvclock_read_cycles(src, rdtsc_ordered());
flags = src->flags;
} while (pvclock_read_retry(src, version));
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index cc457ff818ad..51402a7e4ca6 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -626,3 +626,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
amd_disable_seq_and_redirect_scrub);
#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+#include <linux/jump_label.h>
+#include <asm/string_64.h>
+
+/* Ivy Bridge, Haswell, Broadwell */
+static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+
+ if (capid0 & 0x10)
+ static_branch_inc(&mcsafe_key);
+}
+
+/* Skylake */
+static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
+{
+ u32 capid0;
+
+ pci_read_config_dword(pdev, 0x84, &capid0);
+
+ if ((capid0 & 0xc0) == 0xc0)
+ static_branch_inc(&mcsafe_key);
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap);
+#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 63bf27d972b7..e244c19a2451 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -705,7 +705,7 @@ static void native_machine_power_off(void)
tboot_shutdown(TB_SHUTDOWN_HALT);
}
-struct machine_ops machine_ops = {
+struct machine_ops machine_ops __ro_after_init = {
.power_off = native_machine_power_off,
.shutdown = native_machine_shutdown,
.emergency_restart = native_machine_emergency_restart,
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 80eab01c1a68..2408c1603438 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -27,8 +27,8 @@ static void remove_e820_regions(struct resource *avail)
int i;
struct e820entry *entry;
- for (i = 0; i < e820.nr_map; i++) {
- entry = &e820.map[i];
+ for (i = 0; i < e820->nr_map; i++) {
+ entry = &e820->map[i];
resource_clip(avail, entry->addr,
entry->addr + entry->size - 1);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0fa60f5f5a16..bbfbca5fea0c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -210,9 +210,9 @@ EXPORT_SYMBOL(boot_cpu_data);
#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
-__visible unsigned long mmu_cr4_features;
+__visible unsigned long mmu_cr4_features __ro_after_init;
#else
-__visible unsigned long mmu_cr4_features = X86_CR4_PAE;
+__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
#endif
/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
@@ -458,8 +458,8 @@ static void __init e820_reserve_setup_data(void)
early_memunmap(data, sizeof(*data));
}
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
- memcpy(&e820_saved, &e820, sizeof(struct e820map));
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
+ memcpy(e820_saved, e820, sizeof(struct e820map));
printk(KERN_INFO "extended physical RAM map:\n");
e820_print_map("reserve setup_data");
}
@@ -763,7 +763,7 @@ static void __init trim_bios_range(void)
*/
e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
}
/* called before trim_bios_range() to spare extra sanitize */
@@ -1032,7 +1032,7 @@ void __init setup_arch(char **cmdline_p)
if (ppro_with_ram_bug()) {
e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
printk(KERN_INFO "fixed physical RAM map:\n");
e820_print_map("bad_ppro");
}
@@ -1096,19 +1096,19 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
memblock_x86_fill();
- if (efi_enabled(EFI_BOOT)) {
+ reserve_bios_regions();
+
+ if (efi_enabled(EFI_MEMMAP)) {
efi_fake_memmap();
efi_find_mirror();
- }
-
- reserve_bios_regions();
+ efi_esrt_init();
- /*
- * The EFI specification says that boot service code won't be called
- * after ExitBootServices(). This is, in fact, a lie.
- */
- if (efi_enabled(EFI_MEMMAP))
+ /*
+ * The EFI specification says that boot service code won't be
+ * called after ExitBootServices(). This is, in fact, a lie.
+ */
efi_reserve_boot_services();
+ }
/* preallocate 4k for mptable mpc */
early_reserve_e820_mpc_new();
@@ -1137,9 +1137,7 @@ void __init setup_arch(char **cmdline_p)
* auditing all the early-boot CR4 manipulation would be needed to
* rule it out.
*/
- if (boot_cpu_data.cpuid_level >= 0)
- /* A CPU has %cr4 if and only if it has CPUID. */
- mmu_cr4_features = __read_cr4();
+ mmu_cr4_features = __read_cr4();
memblock_set_current_limit(get_max_mapped());
@@ -1221,8 +1219,7 @@ void __init setup_arch(char **cmdline_p)
/*
* get boot-time SMP configuration:
*/
- if (smp_found_config)
- get_smp_config();
+ get_smp_config();
prefill_possible_map();
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 7a40e068302d..2bbd27f89802 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -33,7 +33,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
[0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);
@@ -246,7 +246,7 @@ void __init setup_per_cpu_areas(void)
#ifdef CONFIG_X86_64
per_cpu(irq_stack_ptr, cpu) =
per_cpu(irq_stack_union.irq_stack, cpu) +
- IRQ_STACK_SIZE - 64;
+ IRQ_STACK_SIZE;
#endif
#ifdef CONFIG_NUMA
per_cpu(x86_cpu_to_node_map, cpu) =
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 04cb3212db2d..763af1d0de64 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -42,6 +42,7 @@
#include <asm/syscalls.h>
#include <asm/sigframe.h>
+#include <asm/signal.h>
#define COPY(x) do { \
get_user_ex(regs->x, &sc->x); \
@@ -547,7 +548,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
return -EFAULT;
if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
- if (copy_siginfo_to_user32(&frame->info, &ksig->info))
+ if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true))
return -EFAULT;
}
@@ -660,20 +661,21 @@ badframe:
return 0;
}
-static inline int is_ia32_compat_frame(void)
+static inline int is_ia32_compat_frame(struct ksignal *ksig)
{
return IS_ENABLED(CONFIG_IA32_EMULATION) &&
- test_thread_flag(TIF_IA32);
+ ksig->ka.sa.sa_flags & SA_IA32_ABI;
}
-static inline int is_ia32_frame(void)
+static inline int is_ia32_frame(struct ksignal *ksig)
{
- return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame();
+ return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(ksig);
}
-static inline int is_x32_frame(void)
+static inline int is_x32_frame(struct ksignal *ksig)
{
- return IS_ENABLED(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
+ return IS_ENABLED(CONFIG_X86_X32_ABI) &&
+ ksig->ka.sa.sa_flags & SA_X32_ABI;
}
static int
@@ -684,12 +686,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
compat_sigset_t *cset = (compat_sigset_t *) set;
/* Set up the stack frame */
- if (is_ia32_frame()) {
+ if (is_ia32_frame(ksig)) {
if (ksig->ka.sa.sa_flags & SA_SIGINFO)
return ia32_setup_rt_frame(usig, ksig, cset, regs);
else
return ia32_setup_frame(usig, ksig, cset, regs);
- } else if (is_x32_frame()) {
+ } else if (is_x32_frame(ksig)) {
return x32_setup_rt_frame(ksig, cset, regs);
} else {
return __setup_rt_frame(ksig->sig, ksig, set, regs);
@@ -783,7 +785,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
* than the tracee.
*/
#ifdef CONFIG_IA32_EMULATION
- if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
+ if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
return __NR_ia32_restart_syscall;
#endif
#ifdef CONFIG_X86_X32_ABI
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index b44564bf86a8..ec1f756f9dc9 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -1,5 +1,6 @@
#include <linux/compat.h>
#include <linux/uaccess.h>
+#include <linux/ptrace.h>
/*
* The compat_siginfo_t structure and handing code is very easy
@@ -92,10 +93,28 @@ static inline void signal_compat_build_tests(void)
/* any new si_fields should be added here */
}
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
+{
+ /* Don't leak in-kernel non-uapi flags to user-space */
+ if (oact)
+ oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+
+ if (!act)
+ return;
+
+ /* Don't let flags to be set from userspace */
+ act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+
+ if (in_ia32_syscall())
+ act->sa.sa_flags |= SA_IA32_ABI;
+ if (in_x32_syscall())
+ act->sa.sa_flags |= SA_X32_ABI;
+}
+
+int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
+ bool x32_ABI)
{
int err = 0;
- bool ia32 = test_thread_flag(TIF_IA32);
signal_compat_build_tests();
@@ -146,7 +165,7 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
put_user_ex(from->si_arch, &to->si_arch);
break;
case __SI_CHLD >> 16:
- if (ia32) {
+ if (!x32_ABI) {
put_user_ex(from->si_utime, &to->si_utime);
put_user_ex(from->si_stime, &to->si_stime);
} else {
@@ -180,6 +199,12 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
return err;
}
+/* from syscall's path, where we know the ABI */
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+{
+ return __copy_siginfo_to_user32(to, from, in_x32_syscall());
+}
+
int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
{
int err = 0;
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 658777cf3851..c00cb64bc0a1 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -32,6 +32,8 @@
#include <asm/nmi.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
+#include <asm/kexec.h>
+
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
@@ -259,8 +261,10 @@ static inline void __smp_reschedule_interrupt(void)
__visible void smp_reschedule_interrupt(struct pt_regs *regs)
{
+ irq_enter();
ack_APIC_irq();
__smp_reschedule_interrupt();
+ irq_exit();
/*
* KVM uses this interrupt to force a cpu out of guest mode
*/
@@ -342,6 +346,9 @@ struct smp_ops smp_ops = {
.smp_cpus_done = native_smp_cpus_done,
.stop_other_cpus = native_stop_other_cpus,
+#if defined(CONFIG_KEXEC_CORE)
+ .crash_stop_other_cpus = kdump_nmi_shootdown_cpus,
+#endif
.smp_send_reschedule = native_smp_send_reschedule,
.cpu_up = native_cpu_up,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 4296beb8fdd3..42f5eb7b4f6c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
-static struct sched_domain_topology_level numa_inside_package_topology[] = {
+static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
@@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = {
#endif
{ NULL, },
};
+
+static struct sched_domain_topology_level x86_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
/*
- * set_sched_topology() sets the topology internal to a CPU. The
- * NUMA topologies are layered on top of it to build the full
- * system topology.
- *
- * If NUMA nodes are observed to occur within a CPU package, this
- * function should be called. It forces the sched domain code to
- * only use the SMT level for the CPU portion of the topology.
- * This essentially falls back to relying on NUMA information
- * from the SRAT table to describe the entire system topology
- * (except for hyperthreads).
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours and Intel Cluster-on-Die have this.
*/
-static void primarily_use_numa_for_topology(void)
-{
- set_sched_topology(numa_inside_package_topology);
-}
+static bool x86_has_numa_in_package;
void set_cpu_sibling_map(int cpu)
{
@@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu)
c->booted_cores = cpu_data(i).booted_cores;
}
if (match_die(c, o) && !topology_same_node(c, o))
- primarily_use_numa_for_topology();
+ x86_has_numa_in_package = true;
}
threads = cpumask_weight(topology_sibling_cpumask(cpu));
@@ -690,7 +691,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
* Give the other CPU some time to accept the IPI.
*/
udelay(200);
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
maxlvt = lapic_get_maxlvt();
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
@@ -717,7 +718,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/*
* Be paranoid about clearing APIC errors.
*/
- if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
@@ -756,7 +757,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
* Determine this based on the APIC version.
* If we don't have an integrated APIC, don't send the STARTUP IPIs.
*/
- if (APIC_INTEGRATED(apic_version[phys_apicid]))
+ if (APIC_INTEGRATED(boot_cpu_apic_version))
num_starts = 2;
else
num_starts = 0;
@@ -942,7 +943,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
per_cpu(cpu_current_top_of_stack, cpu) =
(unsigned long)task_stack_page(idle) + THREAD_SIZE;
#else
- clear_tsk_thread_flag(idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
#endif
}
@@ -969,7 +969,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
- stack_start = idle->thread.sp;
+ initial_stack = idle->thread.sp;
/*
* Enable the espfix hack for this CPU
@@ -994,7 +994,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
/*
* Be paranoid about clearing APIC errors.
*/
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ if (APIC_INTEGRATED(boot_cpu_apic_version)) {
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
}
@@ -1115,17 +1115,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
common_cpu_up(cpu, tidle);
- /*
- * We have to walk the irq descriptors to setup the vector
- * space for the cpu which comes online. Prevent irq
- * alloc/free across the bringup.
- */
- irq_lock_sparse();
-
err = do_boot_cpu(apicid, cpu, tidle);
-
if (err) {
- irq_unlock_sparse();
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
return -EIO;
}
@@ -1143,8 +1134,6 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
touch_nmi_watchdog();
}
- irq_unlock_sparse();
-
return 0;
}
@@ -1249,7 +1238,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
/*
* If we couldn't find a local APIC, then get out of here now!
*/
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
+ if (APIC_INTEGRATED(boot_cpu_apic_version) &&
!boot_cpu_has(X86_FEATURE_APIC)) {
if (!disable_apic) {
pr_err("BIOS bug, local APIC #%d not detected!...\n",
@@ -1304,6 +1293,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
}
+
+ /*
+ * Set 'default' x86 topology, this matches default_topology() in that
+ * it has NUMA nodes as a topology level. See also
+ * native_smp_cpus_done().
+ *
+ * Must be done before set_cpus_sibling_map() is ran.
+ */
+ set_sched_topology(x86_topology);
+
set_cpu_sibling_map(0);
switch (smp_sanity_check(max_cpus)) {
@@ -1323,14 +1322,13 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
break;
}
- default_setup_apic_routing();
-
if (read_apic_id() != boot_cpu_physical_apicid) {
panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
read_apic_id(), boot_cpu_physical_apicid);
/* Or can we switch back to PIC here? */
}
+ default_setup_apic_routing();
cpu0_logical_apicid = apic_bsp_setup(false);
pr_info("CPU%d: ", 0);
@@ -1370,6 +1368,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
{
pr_debug("Boot done\n");
+ if (x86_has_numa_in_package)
+ set_sched_topology(x86_numa_in_package_topology);
+
nmi_selftest();
impress_friends();
setup_ioapic_dest();
@@ -1406,9 +1407,23 @@ __init void prefill_possible_map(void)
{
int i, possible;
- /* no processor from mptable or madt */
- if (!num_processors)
- num_processors = 1;
+ /* No boot processor was found in mptable or ACPI MADT */
+ if (!num_processors) {
+ if (boot_cpu_has(X86_FEATURE_APIC)) {
+ int apicid = boot_cpu_physical_apicid;
+ int cpu = hard_smp_processor_id();
+
+ pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu);
+
+ /* Make sure boot cpu is enumerated */
+ if (apic->cpu_present_to_apicid(0) == BAD_APICID &&
+ apic->apic_id_valid(apicid))
+ generic_processor_info(apicid, boot_cpu_apic_version);
+ }
+
+ if (!num_processors)
+ num_processors = 1;
+ }
i = setup_max_cpus ?: 1;
if (setup_possible_cpus == -1) {
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 4738f5e0f2ab..0653788026e2 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -8,80 +8,69 @@
#include <linux/export.h>
#include <linux/uaccess.h>
#include <asm/stacktrace.h>
+#include <asm/unwind.h>
-static int save_stack_stack(void *data, char *name)
+static int save_stack_address(struct stack_trace *trace, unsigned long addr,
+ bool nosched)
{
- return 0;
-}
-
-static int
-__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
-{
- struct stack_trace *trace = data;
-#ifdef CONFIG_FRAME_POINTER
- if (!reliable)
- return 0;
-#endif
if (nosched && in_sched_functions(addr))
return 0;
+
if (trace->skip > 0) {
trace->skip--;
return 0;
}
- if (trace->nr_entries < trace->max_entries) {
- trace->entries[trace->nr_entries++] = addr;
- return 0;
- } else {
- return -1; /* no more room, stop walking the stack */
- }
-}
-static int save_stack_address(void *data, unsigned long addr, int reliable)
-{
- return __save_stack_address(data, addr, reliable, false);
+ if (trace->nr_entries >= trace->max_entries)
+ return -1;
+
+ trace->entries[trace->nr_entries++] = addr;
+ return 0;
}
-static int
-save_stack_address_nosched(void *data, unsigned long addr, int reliable)
+static void __save_stack_trace(struct stack_trace *trace,
+ struct task_struct *task, struct pt_regs *regs,
+ bool nosched)
{
- return __save_stack_address(data, addr, reliable, true);
-}
+ struct unwind_state state;
+ unsigned long addr;
-static const struct stacktrace_ops save_stack_ops = {
- .stack = save_stack_stack,
- .address = save_stack_address,
- .walk_stack = print_context_stack,
-};
+ if (regs)
+ save_stack_address(trace, regs->ip, nosched);
-static const struct stacktrace_ops save_stack_ops_nosched = {
- .stack = save_stack_stack,
- .address = save_stack_address_nosched,
- .walk_stack = print_context_stack,
-};
+ for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr || save_stack_address(trace, addr, nosched))
+ break;
+ }
+
+ if (trace->nr_entries < trace->max_entries)
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
/*
* Save stack-backtrace addresses into a stack_trace buffer.
*/
void save_stack_trace(struct stack_trace *trace)
{
- dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ __save_stack_trace(trace, current, NULL, false);
}
EXPORT_SYMBOL_GPL(save_stack_trace);
void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
{
- dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ __save_stack_trace(trace, current, regs, false);
}
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
- dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = ULONG_MAX;
+ if (!try_get_task_stack(tsk))
+ return;
+
+ __save_stack_trace(trace, tsk, NULL, true);
+
+ put_task_stack(tsk);
}
EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index c9a073866ca7..a23ce84a3f6c 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -57,7 +57,8 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
unsigned char opcode[15];
unsigned long addr = convert_ip_to_linear(child, regs);
- copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
+ copied = access_process_vm(child, addr, opcode, sizeof(opcode),
+ FOLL_FORCE);
for (i = 0; i < copied; i++) {
switch (opcode[i]) {
/* popf and iret */
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 10e0272d789a..a55ed63b9f91 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -101,7 +101,6 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
unsigned long *end)
{
if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) {
- unsigned long new_begin;
/* This is usually used needed to map code in small
model, so it needs to be in the first 31bit. Limit
it to that. This means we need to move the
@@ -112,9 +111,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
*begin = 0x40000000;
*end = 0x80000000;
if (current->flags & PF_RANDOMIZE) {
- new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
- if (new_begin)
- *begin = new_begin;
+ *begin = randomize_page(*begin, 0x02000000);
}
} else {
*begin = current->mm->mmap_legacy_base;
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 654f6c66fe45..8402907825b0 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -188,12 +188,12 @@ static int tboot_setup_sleep(void)
tboot->num_mac_regions = 0;
- for (i = 0; i < e820.nr_map; i++) {
- if ((e820.map[i].type != E820_RAM)
- && (e820.map[i].type != E820_RESERVED_KERN))
+ for (i = 0; i < e820->nr_map; i++) {
+ if ((e820->map[i].type != E820_RAM)
+ && (e820->map[i].type != E820_RESERVED_KERN))
continue;
- add_mac_region(e820.map[i].addr, e820.map[i].size);
+ add_mac_region(e820->map[i].addr, e820->map[i].size);
}
tboot->acpi_sinfo.kernel_s3_resume_vector =
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b70ca12dd389..bd4e3d4d3625 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
+#ifdef CONFIG_VMAP_STACK
+__visible void __noreturn handle_stack_overflow(const char *message,
+ struct pt_regs *regs,
+ unsigned long fault_address)
+{
+ printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
+ (void *)fault_address, current->stack,
+ (char *)current->stack + THREAD_SIZE - 1);
+ die(message, regs, 0);
+
+ /* Be absolutely certain we don't return. */
+ panic(message);
+}
+#endif
+
#ifdef CONFIG_X86_64
/* Runs on IST stack */
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+ unsigned long cr2;
+#endif
#ifdef CONFIG_X86_ESPFIX64
extern unsigned char native_irq_return_iret[];
@@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_DF;
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * If we overflow the stack into a guard page, the CPU will fail
+ * to deliver #PF and will send #DF instead. Similarly, if we
+ * take any non-IST exception while too close to the bottom of
+ * the stack, the processor will get a page fault while
+ * delivering the exception and will generate a double fault.
+ *
+ * According to the SDM (footnote in 6.15 under "Interrupt 14 -
+ * Page-Fault Exception (#PF):
+ *
+ * Processors update CR2 whenever a page fault is detected. If a
+ * second page fault occurs while an earlier page fault is being
+ * deliv- ered, the faulting linear address of the second fault will
+ * overwrite the contents of CR2 (replacing the previous
+ * address). These updates to CR2 occur even if the page fault
+ * results in a double fault or occurs during the delivery of a
+ * double fault.
+ *
+ * The logic below has a small possibility of incorrectly diagnosing
+ * some errors as stack overflows. For example, if the IDT or GDT
+ * gets corrupted such that #GP delivery fails due to a bad descriptor
+ * causing #GP and we hit this condition while CR2 coincidentally
+ * points to the stack guard page, we'll think we overflowed the
+ * stack. Given that we're going to panic one way or another
+ * if this happens, this isn't necessarily worth fixing.
+ *
+ * If necessary, we could improve the test by only diagnosing
+ * a stack overflow if the saved RSP points within 47 bytes of
+ * the bottom of the stack: if RSP == tsk_stack + 48 and we
+ * take an exception, the stack is already aligned and there
+ * will be enough room SS, RSP, RFLAGS, CS, RIP, and a
+ * possible error code, so a stack overflow would *not* double
+ * fault. With any less space left, exception delivery could
+ * fail, and, as a practical matter, we've overflowed the
+ * stack even if the actual trigger for the double fault was
+ * something else.
+ */
+ cr2 = read_cr2();
+ if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
+ handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
+#endif
+
#ifdef CONFIG_DOUBLEFAULT
df_debug(regs, error_code);
#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 78b9cb5a26af..46b2f41f8b05 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -23,6 +23,7 @@
#include <asm/x86_init.h>
#include <asm/geode.h>
#include <asm/apic.h>
+#include <asm/intel-family.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -686,11 +687,16 @@ unsigned long native_calibrate_tsc(void)
if (crystal_khz == 0) {
switch (boot_cpu_data.x86_model) {
- case 0x4E: /* SKL */
- case 0x5E: /* SKL */
+ case INTEL_FAM6_SKYLAKE_MOBILE:
+ case INTEL_FAM6_SKYLAKE_DESKTOP:
+ case INTEL_FAM6_KABYLAKE_MOBILE:
+ case INTEL_FAM6_KABYLAKE_DESKTOP:
crystal_khz = 24000; /* 24.0 MHz */
break;
- case 0x5C: /* BXT */
+ case INTEL_FAM6_SKYLAKE_X:
+ crystal_khz = 25000; /* 25.0 MHz */
+ break;
+ case INTEL_FAM6_ATOM_GOLDMONT:
crystal_khz = 19200; /* 19.2 MHz */
break;
}
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
new file mode 100644
index 000000000000..a2456d4d286a
--- /dev/null
+++ b/arch/x86/kernel/unwind_frame.c
@@ -0,0 +1,93 @@
+#include <linux/sched.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+#define FRAME_HEADER_SIZE (sizeof(long) * 2)
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ unsigned long addr;
+ unsigned long *addr_p = unwind_get_return_address_ptr(state);
+
+ if (unwind_done(state))
+ return 0;
+
+ addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p,
+ addr_p);
+
+ return __kernel_text_address(addr) ? addr : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+static bool update_stack_state(struct unwind_state *state, void *addr,
+ size_t len)
+{
+ struct stack_info *info = &state->stack_info;
+
+ /*
+ * If addr isn't on the current stack, switch to the next one.
+ *
+ * We may have to traverse multiple stacks to deal with the possibility
+ * that 'info->next_sp' could point to an empty stack and 'addr' could
+ * be on a subsequent stack.
+ */
+ while (!on_stack(info, addr, len))
+ if (get_stack_info(info->next_sp, state->task, info,
+ &state->stack_mask))
+ return false;
+
+ return true;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ unsigned long *next_bp;
+
+ if (unwind_done(state))
+ return false;
+
+ next_bp = (unsigned long *)*state->bp;
+
+ /* make sure the next frame's data is accessible */
+ if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE))
+ return false;
+
+ /* move to the next frame */
+ state->bp = next_bp;
+ return true;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+ state->task = task;
+
+ /* don't even attempt to start from user mode regs */
+ if (regs && user_mode(regs)) {
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return;
+ }
+
+ /* set up the starting stack frame */
+ state->bp = get_frame_pointer(task, regs);
+
+ /* initialize stack info and make sure the frame data is accessible */
+ get_stack_info(state->bp, state->task, &state->stack_info,
+ &state->stack_mask);
+ update_stack_state(state, state->bp, FRAME_HEADER_SIZE);
+
+ /*
+ * The caller can provide the address of the first frame directly
+ * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+ * to start unwinding at. Skip ahead until we reach it.
+ */
+ while (!unwind_done(state) &&
+ (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+ state->bp < first_frame))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
new file mode 100644
index 000000000000..9298993dc8b7
--- /dev/null
+++ b/arch/x86/kernel/unwind_guess.c
@@ -0,0 +1,53 @@
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return 0;
+
+ return ftrace_graph_ret_addr(state->task, &state->graph_idx,
+ *state->sp, state->sp);
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+ struct stack_info *info = &state->stack_info;
+
+ if (unwind_done(state))
+ return false;
+
+ do {
+ for (state->sp++; state->sp < info->end; state->sp++)
+ if (__kernel_text_address(*state->sp))
+ return true;
+
+ state->sp = info->next_sp;
+
+ } while (!get_stack_info(state->sp, state->task, info,
+ &state->stack_mask));
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+ struct pt_regs *regs, unsigned long *first_frame)
+{
+ memset(state, 0, sizeof(*state));
+
+ state->task = task;
+ state->sp = first_frame;
+
+ get_stack_info(first_frame, state->task, &state->stack_info,
+ &state->stack_mask);
+
+ if (!__kernel_text_address(*first_frame))
+ unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9297a002d8e5..dbf67f64d5ec 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -97,6 +97,7 @@ SECTIONS
_stext = .;
TEXT_TEXT
SCHED_TEXT
+ CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
ENTRY_TEXT
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
deleted file mode 100644
index 95e49f6e4fc3..000000000000
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Exports for assembly files.
- All C exports should go in the respective C files. */
-
-#include <linux/export.h>
-#include <linux/spinlock_types.h>
-#include <linux/smp.h>
-
-#include <net/checksum.h>
-
-#include <asm/processor.h>
-#include <asm/pgtable.h>
-#include <asm/uaccess.h>
-#include <asm/desc.h>
-#include <asm/ftrace.h>
-
-#ifdef CONFIG_FUNCTION_TRACER
-/* mcount and __fentry__ are defined in assembly */
-#ifdef CC_USING_FENTRY
-EXPORT_SYMBOL(__fentry__);
-#else
-EXPORT_SYMBOL(mcount);
-#endif
-#endif
-
-EXPORT_SYMBOL(__get_user_1);
-EXPORT_SYMBOL(__get_user_2);
-EXPORT_SYMBOL(__get_user_4);
-EXPORT_SYMBOL(__get_user_8);
-EXPORT_SYMBOL(__put_user_1);
-EXPORT_SYMBOL(__put_user_2);
-EXPORT_SYMBOL(__put_user_4);
-EXPORT_SYMBOL(__put_user_8);
-
-EXPORT_SYMBOL(copy_user_generic_string);
-EXPORT_SYMBOL(copy_user_generic_unrolled);
-EXPORT_SYMBOL(copy_user_enhanced_fast_string);
-EXPORT_SYMBOL(__copy_user_nocache);
-EXPORT_SYMBOL(_copy_from_user);
-EXPORT_SYMBOL(_copy_to_user);
-
-EXPORT_SYMBOL_GPL(memcpy_mcsafe);
-
-EXPORT_SYMBOL(copy_page);
-EXPORT_SYMBOL(clear_page);
-
-EXPORT_SYMBOL(csum_partial);
-
-EXPORT_SYMBOL(__sw_hweight32);
-EXPORT_SYMBOL(__sw_hweight64);
-
-/*
- * Export string functions. We normally rely on gcc builtin for most of these,
- * but gcc sometimes decides not to inline them.
- */
-#undef memcpy
-#undef memset
-#undef memmove
-
-extern void *__memset(void *, int, __kernel_size_t);
-extern void *__memcpy(void *, const void *, __kernel_size_t);
-extern void *__memmove(void *, const void *, __kernel_size_t);
-extern void *memset(void *, int, __kernel_size_t);
-extern void *memcpy(void *, const void *, __kernel_size_t);
-extern void *memmove(void *, const void *, __kernel_size_t);
-
-EXPORT_SYMBOL(__memset);
-EXPORT_SYMBOL(__memcpy);
-EXPORT_SYMBOL(__memmove);
-
-EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(memmove);
-
-#ifndef CONFIG_DEBUG_VIRTUAL
-EXPORT_SYMBOL(phys_base);
-#endif
-EXPORT_SYMBOL(empty_zero_page);
-#ifndef CONFIG_PARAVIRT
-EXPORT_SYMBOL(native_load_gs_index);
-#endif
-
-#ifdef CONFIG_PREEMPT
-EXPORT_SYMBOL(___preempt_schedule);
-EXPORT_SYMBOL(___preempt_schedule_notrace);
-#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 76c5e52436c4..0bd9f1287f39 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -91,7 +91,7 @@ struct x86_cpuinit_ops x86_cpuinit = {
static void default_nmi_init(void) { };
static int default_i8042_detect(void) { return 1; };
-struct x86_platform_ops x86_platform = {
+struct x86_platform_ops x86_platform __ro_after_init = {
.calibrate_cpu = native_calibrate_cpu,
.calibrate_tsc = native_calibrate_tsc,
.get_wallclock = mach_get_cmos_time,
@@ -108,7 +108,7 @@ struct x86_platform_ops x86_platform = {
EXPORT_SYMBOL_GPL(x86_platform);
#if defined(CONFIG_PCI_MSI)
-struct x86_msi_ops x86_msi = {
+struct x86_msi_ops x86_msi __ro_after_init = {
.setup_msi_irqs = native_setup_msi_irqs,
.teardown_msi_irq = native_teardown_msi_irq,
.teardown_msi_irqs = default_teardown_msi_irqs,
@@ -137,7 +137,7 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
}
#endif
-struct x86_io_apic_ops x86_io_apic_ops = {
+struct x86_io_apic_ops x86_io_apic_ops __ro_after_init = {
.read = native_io_apic_read,
.disable = native_disable_io_apic,
};
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 464fa477afbf..3bff20710471 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -13,7 +13,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
- hyperv.o page_track.o
+ hyperv.o page_track.o debugfs.o
kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 3235e0fe7792..afa7bbb596cd 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -366,7 +366,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
- F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB);
+ F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+ F(AVX512BW) | F(AVX512VL);
/* cpuid 0xD.1.eax */
const u32 kvm_cpuid_D_1_eax_x86_features =
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
new file mode 100644
index 000000000000..c19c7ede9bd6
--- /dev/null
+++ b/arch/x86/kvm/debugfs.c
@@ -0,0 +1,69 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * Copyright 2016 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/kvm_host.h>
+#include <linux/debugfs.h>
+
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+ return true;
+}
+
+static int vcpu_get_tsc_offset(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_offset;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n");
+
+static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data;
+ *val = vcpu->arch.tsc_scaling_ratio;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n");
+
+static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val)
+{
+ *val = kvm_tsc_scaling_ratio_frac_bits;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n");
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+ struct dentry *ret;
+
+ ret = debugfs_create_file("tsc-offset", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_offset_fops);
+ if (!ret)
+ return -ENOMEM;
+
+ if (kvm_has_tsc_control) {
+ ret = debugfs_create_file("tsc-scaling-ratio", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_scaling_fops);
+ if (!ret)
+ return -ENOMEM;
+ ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444,
+ vcpu->debugfs_dentry,
+ vcpu, &vcpu_tsc_scaling_frac_fops);
+ if (!ret)
+ return -ENOMEM;
+
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 01bd7b7a6866..42b1c83741c8 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -386,7 +386,21 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
static u64 get_time_ref_counter(struct kvm *kvm)
{
- return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ struct kvm_vcpu *vcpu;
+ u64 tsc;
+
+ /*
+ * The guest has not set up the TSC page or the clock isn't
+ * stable, fall back to get_kvmclock_ns.
+ */
+ if (!hv->tsc_ref.tsc_sequence)
+ return div_u64(get_kvmclock_ns(kvm), 100);
+
+ vcpu = kvm_get_vcpu(kvm, 0);
+ tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+ + hv->tsc_ref.tsc_offset;
}
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -756,6 +770,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
return 0;
}
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ *
+ * Hyper-V formula:
+ * nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ * + system_time
+ * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ * + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ * nsec/100 = ticks * scale / 2^64
+ * - tsc_timestamp * scale / 2^64
+ * + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ * offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+ HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+ u64 max_mul;
+
+ if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+ return false;
+
+ /*
+ * check if scale would overflow, if so we use the time ref counter
+ * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+ * tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+ * tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+ */
+ max_mul = 100ull << (32 - hv_clock->tsc_shift);
+ if (hv_clock->tsc_to_system_mul >= max_mul)
+ return false;
+
+ /*
+ * Otherwise compute the scale and offset according to the formulas
+ * derived above.
+ */
+ tsc_ref->tsc_scale =
+ mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+ hv_clock->tsc_to_system_mul,
+ 100);
+
+ tsc_ref->tsc_offset = hv_clock->system_time;
+ do_div(tsc_ref->tsc_offset, 100);
+ tsc_ref->tsc_offset -=
+ mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+ return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock)
+{
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+ u32 tsc_seq;
+ u64 gfn;
+
+ BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+ BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ return;
+
+ gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+ /*
+ * Because the TSC parameters only vary when there is a
+ * change in the master clock, do not bother with caching.
+ */
+ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+ &tsc_seq, sizeof(tsc_seq))))
+ return;
+
+ /*
+ * While we're computing and writing the parameters, force the
+ * guest to use the time reference count MSR.
+ */
+ hv->tsc_ref.tsc_sequence = 0;
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+ return;
+
+ if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+ return;
+
+ /* Ensure sequence is zero before writing the rest of the struct. */
+ smp_wmb();
+ if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+ return;
+
+ /*
+ * Now switch to the TSC page mechanism by writing the sequence.
+ */
+ tsc_seq++;
+ if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+ tsc_seq = 1;
+
+ /* Write the struct entirely before the non-zero sequence. */
+ smp_wmb();
+
+ hv->tsc_ref.tsc_sequence = tsc_seq;
+ kvm_write_guest(kvm, gfn_to_gpa(gfn),
+ &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+}
+
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
bool host)
{
@@ -793,23 +930,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
mark_page_dirty(kvm, gfn);
break;
}
- case HV_X64_MSR_REFERENCE_TSC: {
- u64 gfn;
- HV_REFERENCE_TSC_PAGE tsc_ref;
-
- memset(&tsc_ref, 0, sizeof(tsc_ref));
+ case HV_X64_MSR_REFERENCE_TSC:
hv->hv_tsc_page = data;
- if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
- break;
- gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
- if (kvm_write_guest(
- kvm,
- gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
- &tsc_ref, sizeof(tsc_ref)))
- return 1;
- mark_page_dirty(kvm, gfn);
+ if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
break;
- }
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
return kvm_hv_msr_set_crash_data(vcpu,
msr - HV_X64_MSR_CRASH_P0,
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 60eccd4bd1d3..cd1119538add 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+ struct pvclock_vcpu_time_info *hv_clock);
+
#endif
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 5fb6c620180e..16a7134eedac 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -212,7 +212,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
*/
smp_mb();
if (atomic_dec_if_positive(&ps->pending) > 0)
- queue_kthread_work(&pit->worker, &pit->expired);
+ kthread_queue_work(&pit->worker, &pit->expired);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -233,7 +233,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
static void destroy_pit_timer(struct kvm_pit *pit)
{
hrtimer_cancel(&pit->pit_state.timer);
- flush_kthread_work(&pit->expired);
+ kthread_flush_work(&pit->expired);
}
static void pit_do_work(struct kthread_work *work)
@@ -272,7 +272,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
if (atomic_read(&ps->reinject))
atomic_inc(&ps->pending);
- queue_kthread_work(&pt->worker, &pt->expired);
+ kthread_queue_work(&pt->worker, &pt->expired);
if (ps->is_periodic) {
hrtimer_add_expires_ns(&ps->timer, ps->period);
@@ -324,7 +324,7 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
/* TODO The new value only affected after the retriggered */
hrtimer_cancel(&ps->timer);
- flush_kthread_work(&pit->expired);
+ kthread_flush_work(&pit->expired);
ps->period = interval;
ps->is_periodic = is_period;
@@ -667,13 +667,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
pid_nr = pid_vnr(pid);
put_pid(pid);
- init_kthread_worker(&pit->worker);
+ kthread_init_worker(&pit->worker);
pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
"kvm-pit/%d", pid_nr);
if (IS_ERR(pit->worker_task))
goto fail_kthread;
- init_kthread_work(&pit->expired, pit_do_work);
+ kthread_init_work(&pit->expired, pit_do_work);
pit->kvm = kvm;
@@ -730,7 +730,7 @@ void kvm_free_pit(struct kvm *kvm)
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
kvm_pit_set_reinject(pit, false);
hrtimer_cancel(&pit->pit_state.timer);
- flush_kthread_work(&pit->expired);
+ kthread_flush_work(&pit->expired);
kthread_stop(pit->worker_task);
kvm_free_irq_source_id(kvm, pit->irq_source_id);
kfree(pit);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index c7220ba94aa7..1a22de70f7f7 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -594,7 +594,7 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
ioapic->irr = 0;
ioapic->irr_delivered = 0;
ioapic->id = 0;
- memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
+ memset(ioapic->irq_eoi, 0x00, sizeof(ioapic->irq_eoi));
rtc_irq_eoi_tracking_reset(ioapic);
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b62c85229711..23b99f305382 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1761,9 +1761,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if (value & MSR_IA32_APICBASE_ENABLE) {
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
static_key_slow_dec_deferred(&apic_hw_disabled);
- } else
+ } else {
static_key_slow_inc(&apic_hw_disabled.key);
- recalculate_apic_map(vcpu->kvm);
+ recalculate_apic_map(vcpu->kvm);
+ }
}
if ((old_value ^ value) & X2APIC_ENABLE) {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3d4cc8cc56a3..d9c7e986b4e4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1207,7 +1207,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
*
* Return true if tlb need be flushed.
*/
-static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
+static bool spte_write_protect(u64 *sptep, bool pt_protect)
{
u64 spte = *sptep;
@@ -1233,12 +1233,12 @@ static bool __rmap_write_protect(struct kvm *kvm,
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_write_protect(kvm, sptep, pt_protect);
+ flush |= spte_write_protect(sptep, pt_protect);
return flush;
}
-static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
+static bool spte_clear_dirty(u64 *sptep)
{
u64 spte = *sptep;
@@ -1256,12 +1256,12 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_clear_dirty(kvm, sptep);
+ flush |= spte_clear_dirty(sptep);
return flush;
}
-static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
+static bool spte_set_dirty(u64 *sptep)
{
u64 spte = *sptep;
@@ -1279,7 +1279,7 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- flush |= spte_set_dirty(kvm, sptep);
+ flush |= spte_set_dirty(sptep);
return flush;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index af523d84d102..f8157a36ab09 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -34,6 +34,8 @@
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
+#include <linux/amd-iommu.h>
+#include <linux/hashtable.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
@@ -41,6 +43,7 @@
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
+#include <asm/irq_remapping.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -96,6 +99,19 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF
+/* AVIC GATAG is encoded using VM and VCPU IDs */
+#define AVIC_VCPU_ID_BITS 8
+#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1)
+
+#define AVIC_VM_ID_BITS 24
+#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS)
+#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1)
+
+#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
+ (y & AVIC_VCPU_ID_MASK))
+#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
+#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK)
+
static bool erratum_383_found __read_mostly;
static const u32 host_save_user_msrs[] = {
@@ -185,6 +201,23 @@ struct vcpu_svm {
struct page *avic_backing_page;
u64 *avic_physical_id_cache;
bool avic_is_running;
+
+ /*
+ * Per-vcpu list of struct amd_svm_iommu_ir:
+ * This is used mainly to store interrupt remapping information used
+ * when update the vcpu affinity. This avoids the need to scan for
+ * IRTE and try to match ga_tag in the IOMMU driver.
+ */
+ struct list_head ir_list;
+ spinlock_t ir_list_lock;
+};
+
+/*
+ * This is a wrapper of struct amd_iommu_ir_data.
+ */
+struct amd_svm_iommu_ir {
+ struct list_head node; /* Used by SVM for per-vcpu ir_list */
+ void *data; /* Storing pointer to struct amd_ir_data */
};
#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
@@ -242,6 +275,10 @@ static int avic;
module_param(avic, int, S_IRUGO);
#endif
+/* AVIC VM ID bit masks and lock */
+static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
+static DEFINE_SPINLOCK(avic_vm_id_lock);
+
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -928,6 +965,55 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
+/* Note:
+ * This hash table is used to map VM_ID to a struct kvm_arch,
+ * when handling AMD IOMMU GALOG notification to schedule in
+ * a particular vCPU.
+ */
+#define SVM_VM_DATA_HASH_BITS 8
+DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static spinlock_t svm_vm_data_hash_lock;
+
+/* Note:
+ * This function is called from IOMMU driver to notify
+ * SVM to schedule in a particular vCPU of a particular VM.
+ */
+static int avic_ga_log_notifier(u32 ga_tag)
+{
+ unsigned long flags;
+ struct kvm_arch *ka = NULL;
+ struct kvm_vcpu *vcpu = NULL;
+ u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
+
+ pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
+ struct kvm *kvm = container_of(ka, struct kvm, arch);
+ struct kvm_arch *vm_data = &kvm->arch;
+
+ if (vm_data->avic_vm_id != vm_id)
+ continue;
+ vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ break;
+ }
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
+ if (!vcpu)
+ return 0;
+
+ /* Note:
+ * At this point, the IOMMU should have already set the pending
+ * bit in the vAPIC backing page. So, we just need to schedule
+ * in the vcpu.
+ */
+ if (vcpu->mode == OUTSIDE_GUEST_MODE)
+ kvm_vcpu_wake_up(vcpu);
+
+ return 0;
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -986,10 +1072,15 @@ static __init int svm_hardware_setup(void)
if (avic) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_AVIC) ||
- !IS_ENABLED(CONFIG_X86_LOCAL_APIC))
+ !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
avic = false;
- else
+ } else {
pr_info("AVIC enabled\n");
+
+ hash_init(svm_vm_data_hash);
+ spin_lock_init(&svm_vm_data_hash_lock);
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ }
}
return 0;
@@ -1028,13 +1119,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
-static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- return svm->vmcb->control.tsc_offset;
-}
-
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1280,19 +1364,55 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
}
+static inline int avic_get_next_vm_id(void)
+{
+ int id;
+
+ spin_lock(&avic_vm_id_lock);
+
+ /* AVIC VM ID is one-based. */
+ id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
+ if (id <= AVIC_VM_ID_MASK)
+ __set_bit(id, avic_vm_id_bitmap);
+ else
+ id = -EAGAIN;
+
+ spin_unlock(&avic_vm_id_lock);
+ return id;
+}
+
+static inline int avic_free_vm_id(int id)
+{
+ if (id <= 0 || id > AVIC_VM_ID_MASK)
+ return -EINVAL;
+
+ spin_lock(&avic_vm_id_lock);
+ __clear_bit(id, avic_vm_id_bitmap);
+ spin_unlock(&avic_vm_id_lock);
+ return 0;
+}
+
static void avic_vm_destroy(struct kvm *kvm)
{
+ unsigned long flags;
struct kvm_arch *vm_data = &kvm->arch;
+ avic_free_vm_id(vm_data->avic_vm_id);
+
if (vm_data->avic_logical_id_table_page)
__free_page(vm_data->avic_logical_id_table_page);
if (vm_data->avic_physical_id_table_page)
__free_page(vm_data->avic_physical_id_table_page);
+
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_del(&vm_data->hnode);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
}
static int avic_vm_init(struct kvm *kvm)
{
- int err = -ENOMEM;
+ unsigned long flags;
+ int vm_id, err = -ENOMEM;
struct kvm_arch *vm_data = &kvm->arch;
struct page *p_page;
struct page *l_page;
@@ -1300,6 +1420,11 @@ static int avic_vm_init(struct kvm *kvm)
if (!avic)
return 0;
+ vm_id = avic_get_next_vm_id();
+ if (vm_id < 0)
+ return vm_id;
+ vm_data->avic_vm_id = (u32)vm_id;
+
/* Allocating physical APIC ID table (4KB) */
p_page = alloc_page(GFP_KERNEL);
if (!p_page)
@@ -1316,6 +1441,10 @@ static int avic_vm_init(struct kvm *kvm)
vm_data->avic_logical_id_table_page = l_page;
clear_page(page_address(l_page));
+ spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
+ spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+
return 0;
free_avic:
@@ -1323,31 +1452,34 @@ free_avic:
return err;
}
-/**
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+static inline int
+avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
- u64 entry;
- int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu);
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
struct vcpu_svm *svm = to_svm(vcpu);
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
- svm->avic_is_running = is_run;
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
- /* ID = 0xff (broadcast), ID > 0xff (reserved) */
- if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
- return;
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
- entry = READ_ONCE(*(svm->avic_physical_id_cache));
- WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK));
+ if (list_empty(&svm->ir_list))
+ goto out;
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- if (is_run)
- entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ ret = amd_iommu_update_ga(cpu, r, ir->data);
+ if (ret)
+ break;
+ }
+out:
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ return ret;
}
static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1374,6 +1506,8 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
+ svm->avic_is_running);
}
static void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1385,10 +1519,27 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu)
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
+ avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
+/**
+ * This function is called during VCPU halt/unhalt.
+ */
+static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ svm->avic_is_running = is_run;
+ if (is_run)
+ avic_vcpu_load(vcpu, vcpu->cpu);
+ else
+ avic_vcpu_put(vcpu);
+}
+
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1450,6 +1601,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
err = avic_init_backing_page(&svm->vcpu);
if (err)
goto free_page4;
+
+ INIT_LIST_HEAD(&svm->ir_list);
+ spin_lock_init(&svm->ir_list_lock);
}
/* We initialize this flag to true to make sure that the is_running
@@ -4246,6 +4400,209 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
kvm_vcpu_wake_up(vcpu);
}
+static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ unsigned long flags;
+ struct amd_svm_iommu_ir *cur;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_for_each_entry(cur, &svm->ir_list, node) {
+ if (cur->data != pi->ir_data)
+ continue;
+ list_del(&cur->node);
+ kfree(cur);
+ break;
+ }
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+}
+
+static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+
+ /**
+ * In some cases, the existing irte is updaed and re-set,
+ * so we need to check here if it's already been * added
+ * to the ir_list.
+ */
+ if (pi->ir_data && (pi->prev_ga_tag != 0)) {
+ struct kvm *kvm = svm->vcpu.kvm;
+ u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
+ struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
+ struct vcpu_svm *prev_svm;
+
+ if (!prev_vcpu) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ prev_svm = to_svm(prev_vcpu);
+ svm_ir_list_del(prev_svm, pi);
+ }
+
+ /**
+ * Allocating new amd_iommu_pi_data, which will get
+ * add to the per-vcpu ir_list.
+ */
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+ if (!ir) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ir->data = pi->ir_data;
+
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+ list_add(&ir->node, &svm->ir_list);
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+out:
+ return ret;
+}
+
+/**
+ * Note:
+ * The HW cannot support posting multicast/broadcast
+ * interrupts to a vCPU. So, we still use legacy interrupt
+ * remapping for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ */
+static int
+get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
+ struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
+{
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu = NULL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
+
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
+ __func__, irq.vector);
+ return -1;
+ }
+
+ pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
+ irq.vector);
+ *svm = to_svm(vcpu);
+ vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+ vcpu_info->vector = irq.vector;
+
+ return 0;
+}
+
+/*
+ * svm_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ int idx, ret = -EINVAL;
+
+ if (!kvm_arch_has_assigned_device(kvm) ||
+ !irq_remapping_cap(IRQ_POSTING_CAP))
+ return 0;
+
+ pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
+ __func__, host_irq, guest_irq, set);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ struct vcpu_data vcpu_info;
+ struct vcpu_svm *svm = NULL;
+
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+
+ /**
+ * Here, we setup with legacy mode in the following cases:
+ * 1. When cannot target interrupt to a specific vcpu.
+ * 2. Unsetting posted interrupt.
+ * 3. APIC virtialization is disabled for the vcpu.
+ */
+ if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
+ kvm_vcpu_apicv_active(&svm->vcpu)) {
+ struct amd_iommu_pi_data pi;
+
+ /* Try to enable guest_mode in IRTE */
+ pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+ pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
+ svm->vcpu.vcpu_id);
+ pi.is_guest_mode = true;
+ pi.vcpu_data = &vcpu_info;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Here, we successfully setting up vcpu affinity in
+ * IOMMU guest mode. Now, we need to store the posted
+ * interrupt information in a per-vcpu ir_list so that
+ * we can reference to them directly when we update vcpu
+ * scheduling information in IOMMU irte.
+ */
+ if (!ret && pi.is_guest_mode)
+ svm_ir_list_add(svm, &pi);
+ } else {
+ /* Use legacy mode in IRTE */
+ struct amd_iommu_pi_data pi;
+
+ /**
+ * Here, pi is used to:
+ * - Tell IOMMU to use legacy mode for this interrupt.
+ * - Retrieve ga_tag of prior interrupt remapping data.
+ */
+ pi.is_guest_mode = false;
+ ret = irq_set_vcpu_affinity(host_irq, &pi);
+
+ /**
+ * Check if the posted interrupt was previously
+ * setup with the guest_mode by checking if the ga_tag
+ * was cached. If so, we need to clean up the per-vcpu
+ * ir_list.
+ */
+ if (!ret && pi.prev_ga_tag) {
+ int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu_by_id(kvm, id);
+ if (vcpu)
+ svm_ir_list_del(to_svm(vcpu), &pi);
+ }
+ }
+
+ if (!ret && svm) {
+ trace_kvm_pi_irte_update(svm->vcpu.vcpu_id,
+ host_irq, e->gsi,
+ vcpu_info.vector,
+ vcpu_info.pi_desc_addr, set);
+ }
+
+ if (ret < 0) {
+ pr_err("%s: failed to update PI IRTE\n", __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4961,7 +5318,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-static struct kvm_x86_ops svm_x86_ops = {
+static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
.hardware_setup = svm_hardware_setup,
@@ -5064,7 +5421,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
- .read_tsc_offset = svm_read_tsc_offset,
.write_tsc_offset = svm_write_tsc_offset,
.adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
.read_l1_tsc = svm_read_l1_tsc,
@@ -5078,6 +5434,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.pmu_ops = &amd_pmu_ops,
.deliver_posted_interrupt = svm_deliver_avic_intr,
+ .update_pi_irte = svm_update_pi_irte,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5cede40e2552..cf1b16dbc98a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -927,6 +927,8 @@ static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static unsigned long *vmx_msr_bitmap_legacy_x2apic;
static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
static unsigned long *vmx_vmread_bitmap;
static unsigned long *vmx_vmwrite_bitmap;
@@ -939,6 +941,7 @@ static DEFINE_SPINLOCK(vmx_vpid_lock);
static struct vmcs_config {
int size;
int order;
+ u32 basic_cap;
u32 revision_id;
u32 pin_based_exec_ctrl;
u32 cpu_based_exec_ctrl;
@@ -1215,6 +1218,11 @@ static inline bool cpu_has_vmx_ple(void)
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
+static inline bool cpu_has_vmx_basic_inout(void)
+{
+ return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+}
+
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
return flexpriority_enabled && lapic_in_kernel(vcpu);
@@ -2518,10 +2526,17 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
else if (cpu_has_secondary_exec_ctrls() &&
(vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
- if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
- else
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+ if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
+ if (is_long_mode(vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+ } else {
+ if (is_long_mode(vcpu))
+ msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+ else
+ msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+ }
} else {
if (is_long_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_longmode;
@@ -2603,11 +2618,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
return host_tsc + tsc_offset;
}
-static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
- return vmcs_read64(TSC_OFFSET);
-}
-
/*
* writes 'offset' into guest's timestamp counter offset register
*/
@@ -2877,6 +2887,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
*pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+ if (cpu_has_vmx_basic_inout())
+ *pdata |= VMX_BASIC_INOUT;
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
@@ -3457,7 +3469,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
return -EIO;
vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->order = get_order(vmcs_config.size);
+ vmcs_conf->order = get_order(vmcs_conf->size);
+ vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
vmcs_conf->revision_id = vmx_msr_low;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
@@ -4678,28 +4691,49 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
msr, MSR_TYPE_R | MSR_TYPE_W);
}
-static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
{
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_R);
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_R);
+ if (apicv_active) {
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_R);
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_R);
+ } else {
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ msr, MSR_TYPE_R);
+ __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ msr, MSR_TYPE_R);
+ }
}
-static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
{
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_R);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_R);
+ if (apicv_active) {
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_R);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_R);
+ } else {
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ msr, MSR_TYPE_R);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ msr, MSR_TYPE_R);
+ }
}
-static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
+static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
{
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_W);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_W);
+ if (apicv_active) {
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
+ msr, MSR_TYPE_W);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
+ msr, MSR_TYPE_W);
+ } else {
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ msr, MSR_TYPE_W);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ msr, MSR_TYPE_W);
+ }
}
static bool vmx_get_enable_apicv(void)
@@ -5279,29 +5313,30 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (is_guest_mode(vcpu))
- return;
+ if (!is_guest_mode(vcpu)) {
+ if (!cpu_has_virtual_nmis()) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->soft_vnmi_blocked = 1;
+ vmx->vnmi_blocked_time = 0;
+ }
- if (!cpu_has_virtual_nmis()) {
- /*
- * Tracking the NMI-blocked state in software is built upon
- * finding the next open IRQ window. This, in turn, depends on
- * well-behaving guests: They have to keep IRQs disabled at
- * least as long as the NMI handler runs. Otherwise we may
- * cause NMI nesting, maybe breaking the guest. But as this is
- * highly unlikely, we can live with the residual risk.
- */
- vmx->soft_vnmi_blocked = 1;
- vmx->vnmi_blocked_time = 0;
+ ++vcpu->stat.nmi_injections;
+ vmx->nmi_known_unmasked = false;
}
- ++vcpu->stat.nmi_injections;
- vmx->nmi_known_unmasked = false;
if (vmx->rmode.vm86_active) {
if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
+
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
}
@@ -6109,7 +6144,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
gla_validity = (exit_qualification >> 7) & 0x3;
- if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
+ if (gla_validity == 0x2) {
printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
@@ -6360,22 +6395,32 @@ static __init int hardware_setup(void)
if (!vmx_msr_bitmap_legacy_x2apic)
goto out2;
+ vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
+ goto out3;
+
vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode)
- goto out3;
+ goto out4;
vmx_msr_bitmap_longmode_x2apic =
(unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode_x2apic)
- goto out4;
+ goto out5;
+
+ vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
+ goto out6;
vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmread_bitmap)
- goto out6;
+ goto out7;
vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmwrite_bitmap)
- goto out7;
+ goto out8;
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -6394,7 +6439,7 @@ static __init int hardware_setup(void)
if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
- goto out8;
+ goto out9;
}
if (boot_cpu_has(X86_FEATURE_NX))
@@ -6461,20 +6506,35 @@ static __init int hardware_setup(void)
vmx_msr_bitmap_legacy, PAGE_SIZE);
memcpy(vmx_msr_bitmap_longmode_x2apic,
vmx_msr_bitmap_longmode, PAGE_SIZE);
+ memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ vmx_msr_bitmap_legacy, PAGE_SIZE);
+ memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ vmx_msr_bitmap_longmode, PAGE_SIZE);
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+ /*
+ * enable_apicv && kvm_vcpu_apicv_active()
+ */
for (msr = 0x800; msr <= 0x8ff; msr++)
- vmx_disable_intercept_msr_read_x2apic(msr);
+ vmx_disable_intercept_msr_read_x2apic(msr, true);
/* TMCCT */
- vmx_enable_intercept_msr_read_x2apic(0x839);
+ vmx_enable_intercept_msr_read_x2apic(0x839, true);
/* TPR */
- vmx_disable_intercept_msr_write_x2apic(0x808);
+ vmx_disable_intercept_msr_write_x2apic(0x808, true);
/* EOI */
- vmx_disable_intercept_msr_write_x2apic(0x80b);
+ vmx_disable_intercept_msr_write_x2apic(0x80b, true);
/* SELF-IPI */
- vmx_disable_intercept_msr_write_x2apic(0x83f);
+ vmx_disable_intercept_msr_write_x2apic(0x83f, true);
+
+ /*
+ * (enable_apicv && !kvm_vcpu_apicv_active()) ||
+ * !enable_apicv
+ */
+ /* TPR */
+ vmx_disable_intercept_msr_read_x2apic(0x808, false);
+ vmx_disable_intercept_msr_write_x2apic(0x808, false);
if (enable_ept) {
kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -6521,14 +6581,18 @@ static __init int hardware_setup(void)
return alloc_kvm_area();
-out8:
+out9:
free_page((unsigned long)vmx_vmwrite_bitmap);
-out7:
+out8:
free_page((unsigned long)vmx_vmread_bitmap);
+out7:
+ free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
out6:
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out4:
+out5:
free_page((unsigned long)vmx_msr_bitmap_longmode);
+out4:
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
out3:
free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
out2:
@@ -6544,7 +6608,9 @@ out:
static __exit void hardware_unsetup(void)
{
free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+ free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
free_page((unsigned long)vmx_msr_bitmap_legacy);
free_page((unsigned long)vmx_msr_bitmap_longmode);
free_page((unsigned long)vmx_io_bitmap_b);
@@ -6726,7 +6792,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
/* TODO: not to reset guest simply here. */
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+ pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
@@ -7013,7 +7079,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
vmx->nested.vmcs02_num = 0;
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
+ HRTIMER_MODE_REL_PINNED);
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
vmx->nested.vmxon = true;
@@ -8435,12 +8501,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
return;
}
- /*
- * There is not point to enable virtualize x2apic without enable
- * apicv
- */
- if (!cpu_has_vmx_virtualize_x2apic_mode() ||
- !kvm_vcpu_apicv_active(vcpu))
+ if (!cpu_has_vmx_virtualize_x2apic_mode())
return;
if (!cpu_need_tpr_shadow(vcpu))
@@ -9598,7 +9659,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
maxphyaddr = cpuid_maxphyaddr(vcpu);
if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
(addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
addr_field, maxphyaddr, count, addr);
return -EINVAL;
@@ -9671,13 +9732,13 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
for (i = 0; i < count; i++) {
if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
&e, sizeof(e))) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s cannot read MSR entry (%u, 0x%08llx)\n",
__func__, i, gpa + i * sizeof(e));
goto fail;
}
if (nested_vmx_load_msr_check(vcpu, &e)) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s check failed (%u, 0x%x, 0x%x)\n",
__func__, i, e.index, e.reserved);
goto fail;
@@ -9685,7 +9746,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
msr.index = e.index;
msr.data = e.value;
if (kvm_set_msr(vcpu, &msr)) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
__func__, i, e.index, e.value);
goto fail;
@@ -9706,13 +9767,13 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
if (kvm_vcpu_read_guest(vcpu,
gpa + i * sizeof(e),
&e, 2 * sizeof(u32))) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s cannot read MSR entry (%u, 0x%08llx)\n",
__func__, i, gpa + i * sizeof(e));
return -EINVAL;
}
if (nested_vmx_store_msr_check(vcpu, &e)) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s check failed (%u, 0x%x, 0x%x)\n",
__func__, i, e.index, e.reserved);
return -EINVAL;
@@ -9720,7 +9781,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
msr_info.host_initiated = false;
msr_info.index = e.index;
if (kvm_get_msr(vcpu, &msr_info)) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s cannot read MSR (%u, 0x%x)\n",
__func__, i, e.index);
return -EINVAL;
@@ -9729,7 +9790,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
gpa + i * sizeof(e) +
offsetof(struct vmx_msr_entry, value),
&msr_info.data, sizeof(msr_info.data))) {
- pr_warn_ratelimited(
+ pr_debug_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
__func__, i, e.index, msr_info.data);
return -EINVAL;
@@ -10500,6 +10561,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
}
+ if (nested_cpu_has_ept(vmcs12))
+ vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
+
if (nested_cpu_has_vid(vmcs12))
vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
@@ -10793,7 +10857,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
* We are now running in L2, mmu_notifier will force to reload the
* page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
*/
- kvm_vcpu_reload_apic_access_page(vcpu);
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
/*
* Exiting from L2 to L1, we're now back to L1 which thinks it just
@@ -11177,7 +11241,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
~FEATURE_CONTROL_LMCE;
}
-static struct kvm_x86_ops vmx_x86_ops = {
+static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
.hardware_setup = hardware_setup,
@@ -11274,7 +11338,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
- .read_tsc_offset = vmx_read_tsc_offset,
.write_tsc_offset = vmx_write_tsc_offset,
.adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
.read_l1_tsc = vmx_read_l1_tsc,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 699f8726539a..e375235d81c9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1367,7 +1367,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
{
- u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+ u64 curr_offset = vcpu->arch.tsc_offset;
vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
}
@@ -1413,6 +1413,12 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ vcpu->arch.tsc_offset = offset;
+}
+
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct kvm *kvm = vcpu->kvm;
@@ -1425,7 +1431,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
- ns = get_kernel_ns();
+ ns = ktime_get_boot_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
@@ -1522,7 +1528,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
update_ia32_tsc_adjust_msr(vcpu, offset);
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -1716,6 +1722,88 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
#endif
}
+static u64 __get_kvmclock_ns(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
+ struct kvm_arch *ka = &kvm->arch;
+ s64 ns;
+
+ if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
+ u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
+ } else {
+ ns = ktime_get_boot_ns() + ka->kvmclock_offset;
+ }
+
+ return ns;
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+ unsigned long flags;
+ s64 ns;
+
+ local_irq_save(flags);
+ ns = __get_kvmclock_ns(kvm);
+ local_irq_restore(flags);
+
+ return ns;
+}
+
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct pvclock_vcpu_time_info guest_hv_clock;
+
+ if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ &guest_hv_clock, sizeof(guest_hv_clock))))
+ return;
+
+ /* This VCPU is paused, but it's legal for a guest to read another
+ * VCPU's kvmclock, so we really have to follow the specification where
+ * it says that version is odd if data is being modified, and even after
+ * it is consistent.
+ *
+ * Version field updates must be kept separate. This is because
+ * kvm_write_guest_cached might use a "rep movs" instruction, and
+ * writes within a string instruction are weakly ordered. So there
+ * are three writes overall.
+ *
+ * As a small optimization, only write the version field in the first
+ * and third write. The vcpu->pv_time cache is still valid, because the
+ * version field is the first in the struct.
+ */
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ vcpu->hv_clock.version = guest_hv_clock.version + 1;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+
+ smp_wmb();
+
+ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+ vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
+
+ smp_wmb();
+
+ vcpu->hv_clock.version++;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+}
+
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, tgt_tsc_khz;
@@ -1723,7 +1811,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
u64 tsc_timestamp, host_tsc;
- struct pvclock_vcpu_time_info guest_hv_clock;
u8 pvclock_flags;
bool use_master_clock;
@@ -1752,7 +1839,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
}
if (!use_master_clock) {
host_tsc = rdtsc();
- kernel_ns = get_kernel_ns();
+ kernel_ns = ktime_get_boot_ns();
}
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -1777,8 +1864,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
local_irq_restore(flags);
- if (!vcpu->pv_time_enabled)
- return 0;
+ /* With all the info we got, fill in the values */
if (kvm_has_tsc_control)
tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
@@ -1790,64 +1876,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hw_tsc_khz = tgt_tsc_khz;
}
- /* With all the info we got, fill in the values */
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
- if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
- &guest_hv_clock, sizeof(guest_hv_clock))))
- return 0;
-
- /* This VCPU is paused, but it's legal for a guest to read another
- * VCPU's kvmclock, so we really have to follow the specification where
- * it says that version is odd if data is being modified, and even after
- * it is consistent.
- *
- * Version field updates must be kept separate. This is because
- * kvm_write_guest_cached might use a "rep movs" instruction, and
- * writes within a string instruction are weakly ordered. So there
- * are three writes overall.
- *
- * As a small optimization, only write the version field in the first
- * and third write. The vcpu->pv_time cache is still valid, because the
- * version field is the first in the struct.
- */
- BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
- vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
-
- smp_wmb();
-
- /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
- pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-
- if (vcpu->pvclock_set_guest_stopped_request) {
- pvclock_flags |= PVCLOCK_GUEST_STOPPED;
- vcpu->pvclock_set_guest_stopped_request = false;
- }
-
/* If the host uses TSC clocksource, then it is stable */
+ pvclock_flags = 0;
if (use_master_clock)
pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
vcpu->hv_clock.flags = pvclock_flags;
- trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
-
- smp_wmb();
-
- vcpu->hv_clock.version++;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ if (vcpu->pv_time_enabled)
+ kvm_setup_pvclock_page(v);
+ if (v == kvm_get_vcpu(v->kvm, 0))
+ kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -2746,7 +2789,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (check_tsc_unstable()) {
u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
if (kvm_lapic_hv_timer_in_use(vcpu) &&
@@ -4039,7 +4082,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_CLOCK: {
struct kvm_clock_data user_ns;
u64 now_ns;
- s64 delta;
r = -EFAULT;
if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
@@ -4051,10 +4093,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
local_irq_disable();
- now_ns = get_kernel_ns();
- delta = user_ns.clock - now_ns;
+ now_ns = __get_kvmclock_ns(kvm);
+ kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
local_irq_enable();
- kvm->arch.kvmclock_offset = delta;
kvm_gen_update_masterclock(kvm);
break;
}
@@ -4062,10 +4103,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
struct kvm_clock_data user_ns;
u64 now_ns;
- local_irq_disable();
- now_ns = get_kernel_ns();
- user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
- local_irq_enable();
+ now_ns = get_kvmclock_ns(kvm);
+ user_ns.clock = now_ns;
user_ns.flags = 0;
memset(&user_ns.pad, 0, sizeof(user_ns.pad));
@@ -5694,13 +5733,13 @@ static int kvmclock_cpu_online(unsigned int cpu)
static void kvm_timer_init(void)
{
- int cpu;
-
max_tsc_khz = tsc_khz;
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
#ifdef CONFIG_CPU_FREQ
struct cpufreq_policy policy;
+ int cpu;
+
memset(&policy, 0, sizeof(policy));
cpu = get_cpu();
cpufreq_get_policy(&policy, cpu);
@@ -6700,7 +6739,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_put_guest_xcr0(vcpu);
- /* Interrupt is enabled by handle_external_intr() */
kvm_x86_ops->handle_external_intr(vcpu);
++vcpu->stat.exits;
@@ -7530,7 +7568,7 @@ int kvm_arch_hardware_enable(void)
* before any KVM threads can be running. Unfortunately, we can't
* bring the TSCs fully up to date with real time, as we aren't yet far
* enough into CPU bringup that we know how much real time has actually
- * elapsed; our helper function, get_kernel_ns() will be using boot
+ * elapsed; our helper function, ktime_get_boot_ns() will be using boot
* variables that haven't been updated yet.
*
* So we simply find the maximum observed TSC above, then record the
@@ -7765,6 +7803,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
mutex_init(&kvm->arch.apic_map_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+ kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
pvclock_update_vm_gtod_copy(kvm);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a82ca466b62e..e8ff3e4ce38a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
return kvm_register_write(vcpu, reg, val);
}
-static inline u64 get_kernel_ns(void)
-{
- return ktime_get_boot_ns();
-}
-
static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
{
return !(kvm->arch.disabled_quirks & quirk);
@@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
+u64 get_kvmclock_ns(struct kvm *kvm);
int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index c1e623209853..4d34bb548b41 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -28,6 +28,7 @@
#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/asm.h>
+#include <asm/export.h>
/*
* computes a partial checksum, e.g. for TCP/UDP fragments
@@ -251,6 +252,7 @@ ENTRY(csum_partial)
ENDPROC(csum_partial)
#endif
+EXPORT_SYMBOL(csum_partial)
/*
unsigned int csum_partial_copy_generic (const char *src, char *dst,
@@ -490,3 +492,4 @@ ENDPROC(csum_partial_copy_generic)
#undef ROUND1
#endif
+EXPORT_SYMBOL(csum_partial_copy_generic)
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 65be7cfaf947..5e2af3a88cf5 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,6 +1,7 @@
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
+#include <asm/export.h>
/*
* Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
@@ -23,6 +24,7 @@ ENTRY(clear_page)
rep stosq
ret
ENDPROC(clear_page)
+EXPORT_SYMBOL(clear_page)
ENTRY(clear_page_orig)
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index ad5349778490..03a186fc06ea 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -7,6 +7,7 @@
*/
#include <linux/linkage.h>
+#include <asm/export.h>
.text
@@ -48,3 +49,4 @@ ENTRY(cmpxchg8b_emu)
ret
ENDPROC(cmpxchg8b_emu)
+EXPORT_SYMBOL(cmpxchg8b_emu)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 24ef1c2104d4..e8508156c99d 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -3,6 +3,7 @@
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
+#include <asm/export.h>
/*
* Some CPUs run faster using the string copy instructions (sane microcode).
@@ -17,6 +18,7 @@ ENTRY(copy_page)
rep movsq
ret
ENDPROC(copy_page)
+EXPORT_SYMBOL(copy_page)
ENTRY(copy_page_regs)
subq $2*8, %rsp
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index bf603ebbfd8e..d376e4b48f88 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -14,6 +14,7 @@
#include <asm/alternative-asm.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/export.h>
/* Standard copy_to_user with segment limit checking */
ENTRY(_copy_to_user)
@@ -29,6 +30,7 @@ ENTRY(_copy_to_user)
"jmp copy_user_enhanced_fast_string", \
X86_FEATURE_ERMS
ENDPROC(_copy_to_user)
+EXPORT_SYMBOL(_copy_to_user)
/* Standard copy_from_user with segment limit checking */
ENTRY(_copy_from_user)
@@ -44,6 +46,8 @@ ENTRY(_copy_from_user)
"jmp copy_user_enhanced_fast_string", \
X86_FEATURE_ERMS
ENDPROC(_copy_from_user)
+EXPORT_SYMBOL(_copy_from_user)
+
.section .fixup,"ax"
/* must zero dest */
@@ -155,6 +159,7 @@ ENTRY(copy_user_generic_unrolled)
_ASM_EXTABLE(21b,50b)
_ASM_EXTABLE(22b,50b)
ENDPROC(copy_user_generic_unrolled)
+EXPORT_SYMBOL(copy_user_generic_unrolled)
/* Some CPUs run faster using the string copy instructions.
* This is also a lot simpler. Use them when possible.
@@ -200,6 +205,7 @@ ENTRY(copy_user_generic_string)
_ASM_EXTABLE(1b,11b)
_ASM_EXTABLE(3b,12b)
ENDPROC(copy_user_generic_string)
+EXPORT_SYMBOL(copy_user_generic_string)
/*
* Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
@@ -229,6 +235,7 @@ ENTRY(copy_user_enhanced_fast_string)
_ASM_EXTABLE(1b,12b)
ENDPROC(copy_user_enhanced_fast_string)
+EXPORT_SYMBOL(copy_user_enhanced_fast_string)
/*
* copy_user_nocache - Uncached memory copy with exception handling
@@ -379,3 +386,4 @@ ENTRY(__copy_user_nocache)
_ASM_EXTABLE(40b,.L_fixup_1b_copy)
_ASM_EXTABLE(41b,.L_fixup_1b_copy)
ENDPROC(__copy_user_nocache)
+EXPORT_SYMBOL(__copy_user_nocache)
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index 9a7fe6a70491..378e5d5bf9b1 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -135,6 +135,7 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
return (__force __wsum)add32_with_carry(do_csum(buff, len),
(__force u32)sum);
}
+EXPORT_SYMBOL(csum_partial);
/*
* this routine is used for miscellaneous IP-like checksums, mainly
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index 0ef5128c2de8..37b62d412148 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -32,6 +32,7 @@
#include <asm/thread_info.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/export.h>
.text
ENTRY(__get_user_1)
@@ -44,6 +45,7 @@ ENTRY(__get_user_1)
ASM_CLAC
ret
ENDPROC(__get_user_1)
+EXPORT_SYMBOL(__get_user_1)
ENTRY(__get_user_2)
add $1,%_ASM_AX
@@ -57,6 +59,7 @@ ENTRY(__get_user_2)
ASM_CLAC
ret
ENDPROC(__get_user_2)
+EXPORT_SYMBOL(__get_user_2)
ENTRY(__get_user_4)
add $3,%_ASM_AX
@@ -70,6 +73,7 @@ ENTRY(__get_user_4)
ASM_CLAC
ret
ENDPROC(__get_user_4)
+EXPORT_SYMBOL(__get_user_4)
ENTRY(__get_user_8)
#ifdef CONFIG_X86_64
@@ -97,6 +101,7 @@ ENTRY(__get_user_8)
ret
#endif
ENDPROC(__get_user_8)
+EXPORT_SYMBOL(__get_user_8)
bad_get_user:
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
index 8a602a1e404a..23d893cbc200 100644
--- a/arch/x86/lib/hweight.S
+++ b/arch/x86/lib/hweight.S
@@ -1,4 +1,5 @@
#include <linux/linkage.h>
+#include <asm/export.h>
#include <asm/asm.h>
@@ -32,6 +33,7 @@ ENTRY(__sw_hweight32)
__ASM_SIZE(pop,) %__ASM_REG(dx)
ret
ENDPROC(__sw_hweight32)
+EXPORT_SYMBOL(__sw_hweight32)
ENTRY(__sw_hweight64)
#ifdef CONFIG_X86_64
@@ -77,3 +79,4 @@ ENTRY(__sw_hweight64)
ret
#endif
ENDPROC(__sw_hweight64)
+EXPORT_SYMBOL(__sw_hweight64)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 2ec0b0abbfaa..779782f58324 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
+#include <asm/export.h>
/*
* We build a jump to memcpy_orig by default which gets NOPped out on
@@ -40,6 +41,8 @@ ENTRY(memcpy)
ret
ENDPROC(memcpy)
ENDPROC(__memcpy)
+EXPORT_SYMBOL(memcpy)
+EXPORT_SYMBOL(__memcpy)
/*
* memcpy_erms() - enhanced fast string memcpy. This is faster and
@@ -181,11 +184,11 @@ ENDPROC(memcpy_orig)
#ifndef CONFIG_UML
/*
- * memcpy_mcsafe - memory copy with machine check exception handling
+ * memcpy_mcsafe_unrolled - memory copy with machine check exception handling
* Note that we only catch machine checks when reading the source addresses.
* Writes to target are posted and don't generate machine checks.
*/
-ENTRY(memcpy_mcsafe)
+ENTRY(memcpy_mcsafe_unrolled)
cmpl $8, %edx
/* Less than 8 bytes? Go to byte copy loop */
jb .L_no_whole_words
@@ -273,7 +276,8 @@ ENTRY(memcpy_mcsafe)
.L_done_memcpy_trap:
xorq %rax, %rax
ret
-ENDPROC(memcpy_mcsafe)
+ENDPROC(memcpy_mcsafe_unrolled)
+EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled)
.section .fixup, "ax"
/* Return -EFAULT for any failure */
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 90ce01bee00c..15de86cd15b0 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,6 +8,7 @@
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
+#include <asm/export.h>
#undef memmove
@@ -207,3 +208,5 @@ ENTRY(__memmove)
retq
ENDPROC(__memmove)
ENDPROC(memmove)
+EXPORT_SYMBOL(__memmove)
+EXPORT_SYMBOL(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index e1229ecd2a82..55b95db30a61 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -3,6 +3,7 @@
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
+#include <asm/export.h>
.weak memset
@@ -43,6 +44,8 @@ ENTRY(__memset)
ret
ENDPROC(memset)
ENDPROC(__memset)
+EXPORT_SYMBOL(memset)
+EXPORT_SYMBOL(__memset)
/*
* ISO C memset - set a memory block to a byte value. This function uses
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index c891ece81e5b..cd5d716d2897 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -15,6 +15,7 @@
#include <asm/errno.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/export.h>
/*
@@ -43,6 +44,7 @@ ENTRY(__put_user_1)
xor %eax,%eax
EXIT
ENDPROC(__put_user_1)
+EXPORT_SYMBOL(__put_user_1)
ENTRY(__put_user_2)
ENTER
@@ -55,6 +57,7 @@ ENTRY(__put_user_2)
xor %eax,%eax
EXIT
ENDPROC(__put_user_2)
+EXPORT_SYMBOL(__put_user_2)
ENTRY(__put_user_4)
ENTER
@@ -67,6 +70,7 @@ ENTRY(__put_user_4)
xor %eax,%eax
EXIT
ENDPROC(__put_user_4)
+EXPORT_SYMBOL(__put_user_4)
ENTRY(__put_user_8)
ENTER
@@ -82,6 +86,7 @@ ENTRY(__put_user_8)
xor %eax,%eax
EXIT
ENDPROC(__put_user_8)
+EXPORT_SYMBOL(__put_user_8)
bad_put_user:
movl $-EFAULT,%eax
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
index 8e2d55f754bf..a03b1c750bfe 100644
--- a/arch/x86/lib/strstr_32.c
+++ b/arch/x86/lib/strstr_32.c
@@ -1,4 +1,5 @@
#include <linux/string.h>
+#include <linux/export.h>
char *strstr(const char *cs, const char *ct)
{
@@ -28,4 +29,4 @@ __asm__ __volatile__(
: "dx", "di");
return __res;
}
-
+EXPORT_SYMBOL(strstr);
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index ba47524f56e8..d1c7de095808 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -52,21 +52,6 @@ static __init int find_northbridge(void)
return -ENOENT;
}
-static __init void early_get_boot_cpu_id(void)
-{
- /*
- * need to get the APIC ID of the BSP so can use that to
- * create apicid_to_node in amd_scan_nodes()
- */
-#ifdef CONFIG_X86_MPPARSE
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- early_get_smp_config();
-#endif
-}
-
int __init amd_numa_init(void)
{
u64 start = PFN_PHYS(0);
@@ -180,8 +165,11 @@ int __init amd_numa_init(void)
cores = 1 << bits;
apicid_base = 0;
- /* get the APIC ID of the BSP early for systems with apicid lifting */
- early_get_boot_cpu_id();
+ /*
+ * get boot-time SMP configuration:
+ */
+ early_get_smp_config();
+
if (boot_cpu_physical_apicid > 0) {
pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
apicid_base = boot_cpu_physical_apicid;
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 832b98f822be..79ae939970d3 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,4 +1,4 @@
-#include <linux/module.h>
+#include <linux/extable.h>
#include <asm/uaccess.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index dc8023060456..9f72ca3b2669 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -5,7 +5,7 @@
*/
#include <linux/sched.h> /* test_thread_flag(), ... */
#include <linux/kdebug.h> /* oops_begin/end, ... */
-#include <linux/module.h> /* search_exception_table */
+#include <linux/extable.h> /* search_exception_tables */
#include <linux/bootmem.h> /* max_low_pfn */
#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
@@ -753,6 +753,38 @@ no_context(struct pt_regs *regs, unsigned long error_code,
return;
}
+#ifdef CONFIG_VMAP_STACK
+ /*
+ * Stack overflow? During boot, we can fault near the initial
+ * stack in the direct map, but that's not an overflow -- check
+ * that we're in vmalloc space to avoid this.
+ */
+ if (is_vmalloc_addr((void *)address) &&
+ (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
+ address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+ register void *__sp asm("rsp");
+ unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
+ /*
+ * We're likely to be running with very little stack space
+ * left. It's plausible that we'd hit this condition but
+ * double-fault even before we get this far, in which case
+ * we're fine: the double-fault handler will deal with it.
+ *
+ * We don't want to make it all the way into the oops code
+ * and then double-fault, though, because we're likely to
+ * break the console driver and lose most of the stack dump.
+ */
+ asm volatile ("movq %[stack], %%rsp\n\t"
+ "call handle_stack_overflow\n\t"
+ "1: jmp 1b"
+ : "+r" (__sp)
+ : "D" ("kernel stack overflow (page fault)"),
+ "S" (regs), "d" (address),
+ [stack] "rm" (stack));
+ unreachable();
+ }
+#endif
+
/*
* 32-bit:
*
@@ -1112,6 +1144,15 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
{
/* This is only called for the current mm, so: */
bool foreign = false;
+
+ /*
+ * Read or write was blocked by protection keys. This is
+ * always an unconditional error and can never result in
+ * a follow-up action to resolve the fault, like a COW.
+ */
+ if (error_code & PF_PK)
+ return 1;
+
/*
* Make sure to check the VMA so that we do not perform
* faults just to hit a PF_PK as soon as we fill in a
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index b8b6a60b32cf..0d4fb3ebbbac 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -435,7 +435,7 @@ slow_irqon:
ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT,
- write, 0, pages);
+ pages, write ? FOLL_WRITE : 0);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d28a2d741f9e..22af912d66d2 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -699,8 +699,10 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
}
}
-void free_initmem(void)
+void __ref free_initmem(void)
{
+ e820_reallocate_tables();
+
free_init_pages("unused kernel",
(unsigned long)(&__init_begin),
(unsigned long)(&__init_end));
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index bda8d5eef04d..ddd2661c4502 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -40,17 +40,26 @@
* You need to add an if/def entry if you introduce a new memory region
* compatible with KASLR. Your entry must be in logical order with memory
* layout. For example, ESPFIX is before EFI because its virtual address is
- * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to
+ * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
* ensure that this order is correct and won't be changed.
*/
static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
-static const unsigned long vaddr_end = VMEMMAP_START;
+
+#if defined(CONFIG_X86_ESPFIX64)
+static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
+#elif defined(CONFIG_EFI)
+static const unsigned long vaddr_end = EFI_VA_START;
+#else
+static const unsigned long vaddr_end = __START_KERNEL_map;
+#endif
/* Default values */
unsigned long page_offset_base = __PAGE_OFFSET_BASE;
EXPORT_SYMBOL(page_offset_base);
unsigned long vmalloc_base = __VMALLOC_BASE;
EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base = __VMEMMAP_BASE;
+EXPORT_SYMBOL(vmemmap_base);
/*
* Memory regions randomized by KASLR (except modules that use a separate logic
@@ -63,6 +72,7 @@ static __initdata struct kaslr_memory_region {
} kaslr_regions[] = {
{ &page_offset_base, 64/* Maximum */ },
{ &vmalloc_base, VMALLOC_SIZE_TB },
+ { &vmemmap_base, 1 },
};
/* Get size in bytes used by the memory region */
@@ -89,6 +99,18 @@ void __init kernel_randomize_memory(void)
struct rnd_state rand_state;
unsigned long remain_entropy;
+ /*
+ * All these BUILD_BUG_ON checks ensures the memory layout is
+ * consistent with the vaddr_start/vaddr_end variables.
+ */
+ BUILD_BUG_ON(vaddr_start >= vaddr_end);
+ BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) &&
+ vaddr_end >= EFI_VA_START);
+ BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) ||
+ config_enabled(CONFIG_EFI)) &&
+ vaddr_end >= __START_KERNEL_map);
+ BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+
if (!kaslr_memory_enabled())
return;
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 80476878eb4c..e4f800999b32 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -544,10 +544,9 @@ static int mpx_resolve_fault(long __user *addr, int write)
{
long gup_ret;
int nr_pages = 1;
- int force = 0;
- gup_ret = get_user_pages((unsigned long)addr, nr_pages, write,
- force, NULL, NULL);
+ gup_ret = get_user_pages((unsigned long)addr, nr_pages,
+ write ? FOLL_WRITE : 0, NULL, NULL);
/*
* get_user_pages() returns number of pages gotten.
* 0 means we failed to fault in and get anything,
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index fb682108f4dc..3f35b48d1d9d 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -722,22 +722,19 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}
-static __init int find_near_online_node(int node)
+static void __init init_memory_less_node(int nid)
{
- int n, val;
- int min_val = INT_MAX;
- int best_node = -1;
+ unsigned long zones_size[MAX_NR_ZONES] = {0};
+ unsigned long zholes_size[MAX_NR_ZONES] = {0};
- for_each_online_node(n) {
- val = node_distance(node, n);
+ /* Allocate and initialize node data. Memory-less node is now online.*/
+ alloc_node_data(nid);
+ free_area_init_node(nid, zones_size, 0, zholes_size);
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- return best_node;
+ /*
+ * All zonelists will be built later in start_kernel() after per cpu
+ * areas are initialized.
+ */
}
/*
@@ -766,8 +763,10 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
+
if (!node_online(node))
- node = find_near_online_node(node);
+ init_memory_less_node(node);
+
numa_set_node(cpu, node);
}
}
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index de391b7bc19a..159b52ccd600 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -254,9 +254,7 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end)
struct memtype *rbt_memtype_lookup(u64 addr)
{
- struct memtype *data;
- data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
- return data;
+ return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
}
#if defined(CONFIG_DEBUG_FS)
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index e8c474451928..f88ce0e5efd9 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -11,6 +11,7 @@
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
+#include <linux/debugfs.h> /* debugfs_create_u32() */
#include <linux/mm_types.h> /* mm_struct, vma, etc... */
#include <linux/pkeys.h> /* PKEY_* */
#include <uapi/asm-generic/mman-common.h>
@@ -21,8 +22,19 @@
int __execute_only_pkey(struct mm_struct *mm)
{
+ bool need_to_set_mm_pkey = false;
+ int execute_only_pkey = mm->context.execute_only_pkey;
int ret;
+ /* Do we need to assign a pkey for mm's execute-only maps? */
+ if (execute_only_pkey == -1) {
+ /* Go allocate one to use, which might fail */
+ execute_only_pkey = mm_pkey_alloc(mm);
+ if (execute_only_pkey < 0)
+ return -1;
+ need_to_set_mm_pkey = true;
+ }
+
/*
* We do not want to go through the relatively costly
* dance to set PKRU if we do not need to. Check it
@@ -32,22 +44,33 @@ int __execute_only_pkey(struct mm_struct *mm)
* can make fpregs inactive.
*/
preempt_disable();
- if (fpregs_active() &&
- !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) {
+ if (!need_to_set_mm_pkey &&
+ fpregs_active() &&
+ !__pkru_allows_read(read_pkru(), execute_only_pkey)) {
preempt_enable();
- return PKEY_DEDICATED_EXECUTE_ONLY;
+ return execute_only_pkey;
}
preempt_enable();
- ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY,
+
+ /*
+ * Set up PKRU so that it denies access for everything
+ * other than execution.
+ */
+ ret = arch_set_user_pkey_access(current, execute_only_pkey,
PKEY_DISABLE_ACCESS);
/*
* If the PKRU-set operation failed somehow, just return
* 0 and effectively disable execute-only support.
*/
- if (ret)
- return 0;
+ if (ret) {
+ mm_set_pkey_free(mm, execute_only_pkey);
+ return -1;
+ }
- return PKEY_DEDICATED_EXECUTE_ONLY;
+ /* We got one, store it and use it from here on out */
+ if (need_to_set_mm_pkey)
+ mm->context.execute_only_pkey = execute_only_pkey;
+ return execute_only_pkey;
}
static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
@@ -55,7 +78,7 @@ static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
/* Do this check first since the vm_flags should be hot */
if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
return false;
- if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY)
+ if (vma_pkey(vma) != vma->vm_mm->context.execute_only_pkey)
return false;
return true;
@@ -99,3 +122,106 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
*/
return vma_pkey(vma);
}
+
+#define PKRU_AD_KEY(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
+
+/*
+ * Make the default PKRU value (at execve() time) as restrictive
+ * as possible. This ensures that any threads clone()'d early
+ * in the process's lifetime will not accidentally get access
+ * to data which is pkey-protected later on.
+ */
+u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
+ PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
+ PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
+ PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
+ PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
+
+/*
+ * Called from the FPU code when creating a fresh set of FPU
+ * registers. This is called from a very specific context where
+ * we know the FPU regstiers are safe for use and we can use PKRU
+ * directly. The fact that PKRU is only available when we are
+ * using eagerfpu mode makes this possible.
+ */
+void copy_init_pkru_to_fpregs(void)
+{
+ u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value);
+ /*
+ * Any write to PKRU takes it out of the XSAVE 'init
+ * state' which increases context switch cost. Avoid
+ * writing 0 when PKRU was already 0.
+ */
+ if (!init_pkru_value_snapshot && !read_pkru())
+ return;
+ /*
+ * Override the PKRU state that came from 'init_fpstate'
+ * with the baseline from the process.
+ */
+ write_pkru(init_pkru_value_snapshot);
+}
+
+static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[32];
+ unsigned int len;
+
+ len = sprintf(buf, "0x%x\n", init_pkru_value);
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t init_pkru_write_file(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[32];
+ ssize_t len;
+ u32 new_init_pkru;
+
+ len = min(count, sizeof(buf) - 1);
+ if (copy_from_user(buf, user_buf, len))
+ return -EFAULT;
+
+ /* Make the buffer a valid string that we can not overrun */
+ buf[len] = '\0';
+ if (kstrtouint(buf, 0, &new_init_pkru))
+ return -EINVAL;
+
+ /*
+ * Don't allow insane settings that will blow the system
+ * up immediately if someone attempts to disable access
+ * or writes to pkey 0.
+ */
+ if (new_init_pkru & (PKRU_AD_BIT|PKRU_WD_BIT))
+ return -EINVAL;
+
+ WRITE_ONCE(init_pkru_value, new_init_pkru);
+ return count;
+}
+
+static const struct file_operations fops_init_pkru = {
+ .read = init_pkru_read_file,
+ .write = init_pkru_write_file,
+ .llseek = default_llseek,
+};
+
+static int __init create_init_pkru_value(void)
+{
+ debugfs_create_file("init_pkru", S_IRUSR | S_IWUSR,
+ arch_debugfs_dir, NULL, &fops_init_pkru);
+ return 0;
+}
+late_initcall(create_init_pkru_value);
+
+static __init int setup_init_pkru(char *opt)
+{
+ u32 new_init_pkru;
+
+ if (kstrtouint(opt, 0, &new_init_pkru))
+ return 1;
+
+ WRITE_ONCE(init_pkru_value, new_init_pkru);
+
+ return 1;
+}
+__setup("init_pkru=", setup_init_pkru);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 4dbe65622810..a7655f6caf7d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
unsigned cpu = smp_processor_id();
if (likely(prev != next)) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ /*
+ * If our current stack is in vmalloc space and isn't
+ * mapped in the new pgd, we'll double-fault. Forcibly
+ * map it.
+ */
+ unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
+
+ pgd_t *pgd = next->pgd + stack_pgd_index;
+
+ if (unlikely(pgd_none(*pgd)))
+ set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+ }
+
#ifdef CONFIG_SMP
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
this_cpu_write(cpu_tlbstate.active_mm, next);
#endif
+
cpumask_set_cpu(cpu, mm_cpumask(next));
/*
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index cb31a4440e58..a2488b6e27d6 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -16,27 +16,7 @@
#include <asm/ptrace.h>
#include <asm/stacktrace.h>
-
-static int backtrace_stack(void *data, char *name)
-{
- /* Yes, we want all stacks */
- return 0;
-}
-
-static int backtrace_address(void *data, unsigned long addr, int reliable)
-{
- unsigned int *depth = data;
-
- if ((*depth)--)
- oprofile_add_trace(addr);
- return 0;
-}
-
-static struct stacktrace_ops backtrace_ops = {
- .stack = backtrace_stack,
- .address = backtrace_address,
- .walk_stack = print_context_stack,
-};
+#include <asm/unwind.h>
#ifdef CONFIG_COMPAT
static struct stack_frame_ia32 *
@@ -113,10 +93,29 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
if (!user_mode(regs)) {
- unsigned long stack = kernel_stack_pointer(regs);
- if (depth)
- dump_trace(NULL, regs, (unsigned long *)stack, 0,
- &backtrace_ops, &depth);
+ struct unwind_state state;
+ unsigned long addr;
+
+ if (!depth)
+ return;
+
+ oprofile_add_trace(regs->ip);
+
+ if (!--depth)
+ return;
+
+ for (unwind_start(&state, current, regs, NULL);
+ !unwind_done(&state); unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr)
+ break;
+
+ oprofile_add_trace(addr);
+
+ if (!--depth)
+ break;
+ }
+
return;
}
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 97062a635b77..5c6fc3577a49 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -23,8 +23,6 @@ obj-y += bus_numa.o
obj-$(CONFIG_AMD_NB) += amd_bus.o
obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o
-obj-$(CONFIG_VMD) += vmd.o
-
ifeq ($(CONFIG_PCI_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
endif
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 7b6a9d14c8c0..a4fdfa7dcc1b 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -677,6 +677,12 @@ static void set_dma_domain_ops(struct pci_dev *pdev)
static void set_dma_domain_ops(struct pci_dev *pdev) {}
#endif
+static void set_dev_domain_options(struct pci_dev *pdev)
+{
+ if (is_vmd(pdev->bus))
+ pdev->hotplug_user_indicators = 1;
+}
+
int pcibios_add_device(struct pci_dev *dev)
{
struct setup_data *data;
@@ -707,6 +713,7 @@ int pcibios_add_device(struct pci_dev *dev)
iounmap(data);
}
set_dma_domain_ops(dev);
+ set_dev_domain_options(dev);
return 0;
}
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 9770e55e768f..1d97cea3b3a4 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -120,9 +120,12 @@ static unsigned long __init bios32_service(unsigned long service)
static struct {
unsigned long address;
unsigned short segment;
-} pci_indirect = { 0, __KERNEL_CS };
+} pci_indirect __ro_after_init = {
+ .address = 0,
+ .segment = __KERNEL_CS,
+};
-static int pci_bios_present;
+static int pci_bios_present __ro_after_init;
static int __init check_pcibios(void)
{
diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c
deleted file mode 100644
index 7948be342ee9..000000000000
--- a/arch/x86/pci/vmd.c
+++ /dev/null
@@ -1,771 +0,0 @@
-/*
- * Volume Management Device driver
- * Copyright (c) 2015, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- */
-
-#include <linux/device.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/msi.h>
-#include <linux/pci.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-
-#include <asm/irqdomain.h>
-#include <asm/device.h>
-#include <asm/msi.h>
-#include <asm/msidef.h>
-
-#define VMD_CFGBAR 0
-#define VMD_MEMBAR1 2
-#define VMD_MEMBAR2 4
-
-/*
- * Lock for manipulating VMD IRQ lists.
- */
-static DEFINE_RAW_SPINLOCK(list_lock);
-
-/**
- * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
- * @node: list item for parent traversal.
- * @rcu: RCU callback item for freeing.
- * @irq: back pointer to parent.
- * @enabled: true if driver enabled IRQ
- * @virq: the virtual IRQ value provided to the requesting driver.
- *
- * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to
- * a VMD IRQ using this structure.
- */
-struct vmd_irq {
- struct list_head node;
- struct rcu_head rcu;
- struct vmd_irq_list *irq;
- bool enabled;
- unsigned int virq;
-};
-
-/**
- * struct vmd_irq_list - list of driver requested IRQs mapping to a VMD vector
- * @irq_list: the list of irq's the VMD one demuxes to.
- * @vmd_vector: the h/w IRQ assigned to the VMD.
- * @index: index into the VMD MSI-X table; used for message routing.
- * @count: number of child IRQs assigned to this vector; used to track
- * sharing.
- */
-struct vmd_irq_list {
- struct list_head irq_list;
- struct vmd_dev *vmd;
- unsigned int vmd_vector;
- unsigned int index;
- unsigned int count;
-};
-
-struct vmd_dev {
- struct pci_dev *dev;
-
- spinlock_t cfg_lock;
- char __iomem *cfgbar;
-
- int msix_count;
- struct msix_entry *msix_entries;
- struct vmd_irq_list *irqs;
-
- struct pci_sysdata sysdata;
- struct resource resources[3];
- struct irq_domain *irq_domain;
- struct pci_bus *bus;
-
-#ifdef CONFIG_X86_DEV_DMA_OPS
- struct dma_map_ops dma_ops;
- struct dma_domain dma_domain;
-#endif
-};
-
-static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus)
-{
- return container_of(bus->sysdata, struct vmd_dev, sysdata);
-}
-
-/*
- * Drivers managing a device in a VMD domain allocate their own IRQs as before,
- * but the MSI entry for the hardware it's driving will be programmed with a
- * destination ID for the VMD MSI-X table. The VMD muxes interrupts in its
- * domain into one of its own, and the VMD driver de-muxes these for the
- * handlers sharing that VMD IRQ. The vmd irq_domain provides the operations
- * and irq_chip to set this up.
- */
-static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
-{
- struct vmd_irq *vmdirq = data->chip_data;
- struct vmd_irq_list *irq = vmdirq->irq;
-
- msg->address_hi = MSI_ADDR_BASE_HI;
- msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_DEST_ID(irq->index);
- msg->data = 0;
-}
-
-/*
- * We rely on MSI_FLAG_USE_DEF_CHIP_OPS to set the IRQ mask/unmask ops.
- */
-static void vmd_irq_enable(struct irq_data *data)
-{
- struct vmd_irq *vmdirq = data->chip_data;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&list_lock, flags);
- WARN_ON(vmdirq->enabled);
- list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list);
- vmdirq->enabled = true;
- raw_spin_unlock_irqrestore(&list_lock, flags);
-
- data->chip->irq_unmask(data);
-}
-
-static void vmd_irq_disable(struct irq_data *data)
-{
- struct vmd_irq *vmdirq = data->chip_data;
- unsigned long flags;
-
- data->chip->irq_mask(data);
-
- raw_spin_lock_irqsave(&list_lock, flags);
- if (vmdirq->enabled) {
- list_del_rcu(&vmdirq->node);
- vmdirq->enabled = false;
- }
- raw_spin_unlock_irqrestore(&list_lock, flags);
-}
-
-/*
- * XXX: Stubbed until we develop acceptable way to not create conflicts with
- * other devices sharing the same vector.
- */
-static int vmd_irq_set_affinity(struct irq_data *data,
- const struct cpumask *dest, bool force)
-{
- return -EINVAL;
-}
-
-static struct irq_chip vmd_msi_controller = {
- .name = "VMD-MSI",
- .irq_enable = vmd_irq_enable,
- .irq_disable = vmd_irq_disable,
- .irq_compose_msi_msg = vmd_compose_msi_msg,
- .irq_set_affinity = vmd_irq_set_affinity,
-};
-
-static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info,
- msi_alloc_info_t *arg)
-{
- return 0;
-}
-
-/*
- * XXX: We can be even smarter selecting the best IRQ once we solve the
- * affinity problem.
- */
-static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc)
-{
- int i, best = 1;
- unsigned long flags;
-
- if (!desc->msi_attrib.is_msix || vmd->msix_count == 1)
- return &vmd->irqs[0];
-
- raw_spin_lock_irqsave(&list_lock, flags);
- for (i = 1; i < vmd->msix_count; i++)
- if (vmd->irqs[i].count < vmd->irqs[best].count)
- best = i;
- vmd->irqs[best].count++;
- raw_spin_unlock_irqrestore(&list_lock, flags);
-
- return &vmd->irqs[best];
-}
-
-static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
- unsigned int virq, irq_hw_number_t hwirq,
- msi_alloc_info_t *arg)
-{
- struct msi_desc *desc = arg->desc;
- struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus);
- struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL);
-
- if (!vmdirq)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&vmdirq->node);
- vmdirq->irq = vmd_next_irq(vmd, desc);
- vmdirq->virq = virq;
-
- irq_domain_set_info(domain, virq, vmdirq->irq->vmd_vector, info->chip,
- vmdirq, handle_untracked_irq, vmd, NULL);
- return 0;
-}
-
-static void vmd_msi_free(struct irq_domain *domain,
- struct msi_domain_info *info, unsigned int virq)
-{
- struct vmd_irq *vmdirq = irq_get_chip_data(virq);
- unsigned long flags;
-
- /* XXX: Potential optimization to rebalance */
- raw_spin_lock_irqsave(&list_lock, flags);
- vmdirq->irq->count--;
- raw_spin_unlock_irqrestore(&list_lock, flags);
-
- kfree_rcu(vmdirq, rcu);
-}
-
-static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev,
- int nvec, msi_alloc_info_t *arg)
-{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
-
- if (nvec > vmd->msix_count)
- return vmd->msix_count;
-
- memset(arg, 0, sizeof(*arg));
- return 0;
-}
-
-static void vmd_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
-{
- arg->desc = desc;
-}
-
-static struct msi_domain_ops vmd_msi_domain_ops = {
- .get_hwirq = vmd_get_hwirq,
- .msi_init = vmd_msi_init,
- .msi_free = vmd_msi_free,
- .msi_prepare = vmd_msi_prepare,
- .set_desc = vmd_set_desc,
-};
-
-static struct msi_domain_info vmd_msi_domain_info = {
- .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
- MSI_FLAG_PCI_MSIX,
- .ops = &vmd_msi_domain_ops,
- .chip = &vmd_msi_controller,
-};
-
-#ifdef CONFIG_X86_DEV_DMA_OPS
-/*
- * VMD replaces the requester ID with its own. DMA mappings for devices in a
- * VMD domain need to be mapped for the VMD, not the device requiring
- * the mapping.
- */
-static struct device *to_vmd_dev(struct device *dev)
-{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
-
- return &vmd->dev->dev;
-}
-
-static struct dma_map_ops *vmd_dma_ops(struct device *dev)
-{
- return get_dma_ops(to_vmd_dev(dev));
-}
-
-static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr,
- gfp_t flag, unsigned long attrs)
-{
- return vmd_dma_ops(dev)->alloc(to_vmd_dev(dev), size, addr, flag,
- attrs);
-}
-
-static void vmd_free(struct device *dev, size_t size, void *vaddr,
- dma_addr_t addr, unsigned long attrs)
-{
- return vmd_dma_ops(dev)->free(to_vmd_dev(dev), size, vaddr, addr,
- attrs);
-}
-
-static int vmd_mmap(struct device *dev, struct vm_area_struct *vma,
- void *cpu_addr, dma_addr_t addr, size_t size,
- unsigned long attrs)
-{
- return vmd_dma_ops(dev)->mmap(to_vmd_dev(dev), vma, cpu_addr, addr,
- size, attrs);
-}
-
-static int vmd_get_sgtable(struct device *dev, struct sg_table *sgt,
- void *cpu_addr, dma_addr_t addr, size_t size,
- unsigned long attrs)
-{
- return vmd_dma_ops(dev)->get_sgtable(to_vmd_dev(dev), sgt, cpu_addr,
- addr, size, attrs);
-}
-
-static dma_addr_t vmd_map_page(struct device *dev, struct page *page,
- unsigned long offset, size_t size,
- enum dma_data_direction dir,
- unsigned long attrs)
-{
- return vmd_dma_ops(dev)->map_page(to_vmd_dev(dev), page, offset, size,
- dir, attrs);
-}
-
-static void vmd_unmap_page(struct device *dev, dma_addr_t addr, size_t size,
- enum dma_data_direction dir, unsigned long attrs)
-{
- vmd_dma_ops(dev)->unmap_page(to_vmd_dev(dev), addr, size, dir, attrs);
-}
-
-static int vmd_map_sg(struct device *dev, struct scatterlist *sg, int nents,
- enum dma_data_direction dir, unsigned long attrs)
-{
- return vmd_dma_ops(dev)->map_sg(to_vmd_dev(dev), sg, nents, dir, attrs);
-}
-
-static void vmd_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
- enum dma_data_direction dir, unsigned long attrs)
-{
- vmd_dma_ops(dev)->unmap_sg(to_vmd_dev(dev), sg, nents, dir, attrs);
-}
-
-static void vmd_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir)
-{
- vmd_dma_ops(dev)->sync_single_for_cpu(to_vmd_dev(dev), addr, size, dir);
-}
-
-static void vmd_sync_single_for_device(struct device *dev, dma_addr_t addr,
- size_t size, enum dma_data_direction dir)
-{
- vmd_dma_ops(dev)->sync_single_for_device(to_vmd_dev(dev), addr, size,
- dir);
-}
-
-static void vmd_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
- int nents, enum dma_data_direction dir)
-{
- vmd_dma_ops(dev)->sync_sg_for_cpu(to_vmd_dev(dev), sg, nents, dir);
-}
-
-static void vmd_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
- int nents, enum dma_data_direction dir)
-{
- vmd_dma_ops(dev)->sync_sg_for_device(to_vmd_dev(dev), sg, nents, dir);
-}
-
-static int vmd_mapping_error(struct device *dev, dma_addr_t addr)
-{
- return vmd_dma_ops(dev)->mapping_error(to_vmd_dev(dev), addr);
-}
-
-static int vmd_dma_supported(struct device *dev, u64 mask)
-{
- return vmd_dma_ops(dev)->dma_supported(to_vmd_dev(dev), mask);
-}
-
-#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
-static u64 vmd_get_required_mask(struct device *dev)
-{
- return vmd_dma_ops(dev)->get_required_mask(to_vmd_dev(dev));
-}
-#endif
-
-static void vmd_teardown_dma_ops(struct vmd_dev *vmd)
-{
- struct dma_domain *domain = &vmd->dma_domain;
-
- if (get_dma_ops(&vmd->dev->dev))
- del_dma_domain(domain);
-}
-
-#define ASSIGN_VMD_DMA_OPS(source, dest, fn) \
- do { \
- if (source->fn) \
- dest->fn = vmd_##fn; \
- } while (0)
-
-static void vmd_setup_dma_ops(struct vmd_dev *vmd)
-{
- const struct dma_map_ops *source = get_dma_ops(&vmd->dev->dev);
- struct dma_map_ops *dest = &vmd->dma_ops;
- struct dma_domain *domain = &vmd->dma_domain;
-
- domain->domain_nr = vmd->sysdata.domain;
- domain->dma_ops = dest;
-
- if (!source)
- return;
- ASSIGN_VMD_DMA_OPS(source, dest, alloc);
- ASSIGN_VMD_DMA_OPS(source, dest, free);
- ASSIGN_VMD_DMA_OPS(source, dest, mmap);
- ASSIGN_VMD_DMA_OPS(source, dest, get_sgtable);
- ASSIGN_VMD_DMA_OPS(source, dest, map_page);
- ASSIGN_VMD_DMA_OPS(source, dest, unmap_page);
- ASSIGN_VMD_DMA_OPS(source, dest, map_sg);
- ASSIGN_VMD_DMA_OPS(source, dest, unmap_sg);
- ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_cpu);
- ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_device);
- ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_cpu);
- ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_device);
- ASSIGN_VMD_DMA_OPS(source, dest, mapping_error);
- ASSIGN_VMD_DMA_OPS(source, dest, dma_supported);
-#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
- ASSIGN_VMD_DMA_OPS(source, dest, get_required_mask);
-#endif
- add_dma_domain(domain);
-}
-#undef ASSIGN_VMD_DMA_OPS
-#else
-static void vmd_teardown_dma_ops(struct vmd_dev *vmd) {}
-static void vmd_setup_dma_ops(struct vmd_dev *vmd) {}
-#endif
-
-static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus,
- unsigned int devfn, int reg, int len)
-{
- char __iomem *addr = vmd->cfgbar +
- (bus->number << 20) + (devfn << 12) + reg;
-
- if ((addr - vmd->cfgbar) + len >=
- resource_size(&vmd->dev->resource[VMD_CFGBAR]))
- return NULL;
-
- return addr;
-}
-
-/*
- * CPU may deadlock if config space is not serialized on some versions of this
- * hardware, so all config space access is done under a spinlock.
- */
-static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
- int len, u32 *value)
-{
- struct vmd_dev *vmd = vmd_from_bus(bus);
- char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
- unsigned long flags;
- int ret = 0;
-
- if (!addr)
- return -EFAULT;
-
- spin_lock_irqsave(&vmd->cfg_lock, flags);
- switch (len) {
- case 1:
- *value = readb(addr);
- break;
- case 2:
- *value = readw(addr);
- break;
- case 4:
- *value = readl(addr);
- break;
- default:
- ret = -EINVAL;
- break;
- }
- spin_unlock_irqrestore(&vmd->cfg_lock, flags);
- return ret;
-}
-
-/*
- * VMD h/w converts non-posted config writes to posted memory writes. The
- * read-back in this function forces the completion so it returns only after
- * the config space was written, as expected.
- */
-static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
- int len, u32 value)
-{
- struct vmd_dev *vmd = vmd_from_bus(bus);
- char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
- unsigned long flags;
- int ret = 0;
-
- if (!addr)
- return -EFAULT;
-
- spin_lock_irqsave(&vmd->cfg_lock, flags);
- switch (len) {
- case 1:
- writeb(value, addr);
- readb(addr);
- break;
- case 2:
- writew(value, addr);
- readw(addr);
- break;
- case 4:
- writel(value, addr);
- readl(addr);
- break;
- default:
- ret = -EINVAL;
- break;
- }
- spin_unlock_irqrestore(&vmd->cfg_lock, flags);
- return ret;
-}
-
-static struct pci_ops vmd_ops = {
- .read = vmd_pci_read,
- .write = vmd_pci_write,
-};
-
-static void vmd_attach_resources(struct vmd_dev *vmd)
-{
- vmd->dev->resource[VMD_MEMBAR1].child = &vmd->resources[1];
- vmd->dev->resource[VMD_MEMBAR2].child = &vmd->resources[2];
-}
-
-static void vmd_detach_resources(struct vmd_dev *vmd)
-{
- vmd->dev->resource[VMD_MEMBAR1].child = NULL;
- vmd->dev->resource[VMD_MEMBAR2].child = NULL;
-}
-
-/*
- * VMD domains start at 0x1000 to not clash with ACPI _SEG domains.
- */
-static int vmd_find_free_domain(void)
-{
- int domain = 0xffff;
- struct pci_bus *bus = NULL;
-
- while ((bus = pci_find_next_bus(bus)) != NULL)
- domain = max_t(int, domain, pci_domain_nr(bus));
- return domain + 1;
-}
-
-static int vmd_enable_domain(struct vmd_dev *vmd)
-{
- struct pci_sysdata *sd = &vmd->sysdata;
- struct resource *res;
- u32 upper_bits;
- unsigned long flags;
- LIST_HEAD(resources);
-
- res = &vmd->dev->resource[VMD_CFGBAR];
- vmd->resources[0] = (struct resource) {
- .name = "VMD CFGBAR",
- .start = 0,
- .end = (resource_size(res) >> 20) - 1,
- .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED,
- };
-
- /*
- * If the window is below 4GB, clear IORESOURCE_MEM_64 so we can
- * put 32-bit resources in the window.
- *
- * There's no hardware reason why a 64-bit window *couldn't*
- * contain a 32-bit resource, but pbus_size_mem() computes the
- * bridge window size assuming a 64-bit window will contain no
- * 32-bit resources. __pci_assign_resource() enforces that
- * artificial restriction to make sure everything will fit.
- *
- * The only way we could use a 64-bit non-prefechable MEMBAR is
- * if its address is <4GB so that we can convert it to a 32-bit
- * resource. To be visible to the host OS, all VMD endpoints must
- * be initially configured by platform BIOS, which includes setting
- * up these resources. We can assume the device is configured
- * according to the platform needs.
- */
- res = &vmd->dev->resource[VMD_MEMBAR1];
- upper_bits = upper_32_bits(res->end);
- flags = res->flags & ~IORESOURCE_SIZEALIGN;
- if (!upper_bits)
- flags &= ~IORESOURCE_MEM_64;
- vmd->resources[1] = (struct resource) {
- .name = "VMD MEMBAR1",
- .start = res->start,
- .end = res->end,
- .flags = flags,
- .parent = res,
- };
-
- res = &vmd->dev->resource[VMD_MEMBAR2];
- upper_bits = upper_32_bits(res->end);
- flags = res->flags & ~IORESOURCE_SIZEALIGN;
- if (!upper_bits)
- flags &= ~IORESOURCE_MEM_64;
- vmd->resources[2] = (struct resource) {
- .name = "VMD MEMBAR2",
- .start = res->start + 0x2000,
- .end = res->end,
- .flags = flags,
- .parent = res,
- };
-
- sd->domain = vmd_find_free_domain();
- if (sd->domain < 0)
- return sd->domain;
-
- sd->node = pcibus_to_node(vmd->dev->bus);
-
- vmd->irq_domain = pci_msi_create_irq_domain(NULL, &vmd_msi_domain_info,
- x86_vector_domain);
- if (!vmd->irq_domain)
- return -ENODEV;
-
- pci_add_resource(&resources, &vmd->resources[0]);
- pci_add_resource(&resources, &vmd->resources[1]);
- pci_add_resource(&resources, &vmd->resources[2]);
- vmd->bus = pci_create_root_bus(&vmd->dev->dev, 0, &vmd_ops, sd,
- &resources);
- if (!vmd->bus) {
- pci_free_resource_list(&resources);
- irq_domain_remove(vmd->irq_domain);
- return -ENODEV;
- }
-
- vmd_attach_resources(vmd);
- vmd_setup_dma_ops(vmd);
- dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain);
- pci_rescan_bus(vmd->bus);
-
- WARN(sysfs_create_link(&vmd->dev->dev.kobj, &vmd->bus->dev.kobj,
- "domain"), "Can't create symlink to domain\n");
- return 0;
-}
-
-static irqreturn_t vmd_irq(int irq, void *data)
-{
- struct vmd_irq_list *irqs = data;
- struct vmd_irq *vmdirq;
-
- rcu_read_lock();
- list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
- generic_handle_irq(vmdirq->virq);
- rcu_read_unlock();
-
- return IRQ_HANDLED;
-}
-
-static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
- struct vmd_dev *vmd;
- int i, err;
-
- if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20))
- return -ENOMEM;
-
- vmd = devm_kzalloc(&dev->dev, sizeof(*vmd), GFP_KERNEL);
- if (!vmd)
- return -ENOMEM;
-
- vmd->dev = dev;
- err = pcim_enable_device(dev);
- if (err < 0)
- return err;
-
- vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0);
- if (!vmd->cfgbar)
- return -ENOMEM;
-
- pci_set_master(dev);
- if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) &&
- dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32)))
- return -ENODEV;
-
- vmd->msix_count = pci_msix_vec_count(dev);
- if (vmd->msix_count < 0)
- return -ENODEV;
-
- vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs),
- GFP_KERNEL);
- if (!vmd->irqs)
- return -ENOMEM;
-
- vmd->msix_entries = devm_kcalloc(&dev->dev, vmd->msix_count,
- sizeof(*vmd->msix_entries),
- GFP_KERNEL);
- if (!vmd->msix_entries)
- return -ENOMEM;
- for (i = 0; i < vmd->msix_count; i++)
- vmd->msix_entries[i].entry = i;
-
- vmd->msix_count = pci_enable_msix_range(vmd->dev, vmd->msix_entries, 1,
- vmd->msix_count);
- if (vmd->msix_count < 0)
- return vmd->msix_count;
-
- for (i = 0; i < vmd->msix_count; i++) {
- INIT_LIST_HEAD(&vmd->irqs[i].irq_list);
- vmd->irqs[i].vmd_vector = vmd->msix_entries[i].vector;
- vmd->irqs[i].index = i;
-
- err = devm_request_irq(&dev->dev, vmd->irqs[i].vmd_vector,
- vmd_irq, 0, "vmd", &vmd->irqs[i]);
- if (err)
- return err;
- }
-
- spin_lock_init(&vmd->cfg_lock);
- pci_set_drvdata(dev, vmd);
- err = vmd_enable_domain(vmd);
- if (err)
- return err;
-
- dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n",
- vmd->sysdata.domain);
- return 0;
-}
-
-static void vmd_remove(struct pci_dev *dev)
-{
- struct vmd_dev *vmd = pci_get_drvdata(dev);
-
- vmd_detach_resources(vmd);
- pci_set_drvdata(dev, NULL);
- sysfs_remove_link(&vmd->dev->dev.kobj, "domain");
- pci_stop_root_bus(vmd->bus);
- pci_remove_root_bus(vmd->bus);
- vmd_teardown_dma_ops(vmd);
- irq_domain_remove(vmd->irq_domain);
-}
-
-#ifdef CONFIG_PM
-static int vmd_suspend(struct device *dev)
-{
- struct pci_dev *pdev = to_pci_dev(dev);
-
- pci_save_state(pdev);
- return 0;
-}
-
-static int vmd_resume(struct device *dev)
-{
- struct pci_dev *pdev = to_pci_dev(dev);
-
- pci_restore_state(pdev);
- return 0;
-}
-#endif
-static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume);
-
-static const struct pci_device_id vmd_ids[] = {
- {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x201d),},
- {0,}
-};
-MODULE_DEVICE_TABLE(pci, vmd_ids);
-
-static struct pci_driver vmd_drv = {
- .name = "vmd",
- .id_table = vmd_ids,
- .probe = vmd_probe,
- .remove = vmd_remove,
- .driver = {
- .pm = &vmd_dev_pm_ops,
- },
-};
-module_pci_driver(vmd_drv);
-
-MODULE_AUTHOR("Intel Corporation");
-MODULE_LICENSE("GPL v2");
-MODULE_VERSION("0.6");
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 3a483cb5ac81..bedfab98077a 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -456,7 +456,7 @@ void __init xen_msi_init(void)
int __init pci_xen_hvm_init(void)
{
- if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs))
+ if (!xen_feature(XENFEAT_hvm_pirqs))
return 0;
#ifdef CONFIG_ACPI
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index 184842ef332e..3c3c19ea94df 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -8,6 +8,7 @@ obj-y += iris/
obj-y += intel/
obj-y += intel-mid/
obj-y += intel-quark/
+obj-y += mellanox/
obj-y += olpc/
obj-y += scx200/
obj-y += sfi/
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
index 8ff7b9355416..d49d3be81953 100644
--- a/arch/x86/platform/atom/punit_atom_debug.c
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -155,7 +155,7 @@ static void punit_dbgfs_unregister(void)
static const struct x86_cpu_id intel_punit_cpu_ids[] = {
ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt),
- ICPU(INTEL_FAM6_ATOM_MERRIFIELD1, punit_device_tng),
+ ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng),
ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht),
{}
};
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index 6a2f5691b1ab..6aad870e8962 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -82,21 +82,12 @@ void __init efi_bgrt_init(void)
}
bgrt_image_size = bmp_header.size;
- bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN);
+ bgrt_image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB);
if (!bgrt_image) {
- pr_notice("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n",
- bgrt_image_size);
- return;
- }
-
- image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB);
- if (!image) {
pr_notice("Ignoring BGRT: failed to map image memory\n");
- kfree(bgrt_image);
bgrt_image = NULL;
return;
}
- memcpy(bgrt_image, image, bgrt_image_size);
- memunmap(image);
+ efi_mem_reserve(bgrt_tab->image_address, bgrt_image_size);
}
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 1fbb408e2e72..bf99aa7005eb 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -166,13 +166,15 @@ static void __init do_add_efi_memmap(void)
}
e820_add_region(start, size, e820_type);
}
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
}
int __init efi_memblock_x86_reserve_range(void)
{
struct efi_info *e = &boot_params.efi_info;
+ struct efi_memory_map_data data;
phys_addr_t pmap;
+ int rv;
if (efi_enabled(EFI_PARAVIRT))
return 0;
@@ -187,11 +189,17 @@ int __init efi_memblock_x86_reserve_range(void)
#else
pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
#endif
- efi.memmap.phys_map = pmap;
- efi.memmap.nr_map = e->efi_memmap_size /
- e->efi_memdesc_size;
- efi.memmap.desc_size = e->efi_memdesc_size;
- efi.memmap.desc_version = e->efi_memdesc_version;
+ data.phys_map = pmap;
+ data.size = e->efi_memmap_size;
+ data.desc_size = e->efi_memdesc_size;
+ data.desc_version = e->efi_memdesc_version;
+
+ rv = efi_memmap_init_early(&data);
+ if (rv)
+ return rv;
+
+ if (add_efi_memmap)
+ do_add_efi_memmap();
WARN(efi.memmap.desc_version != 1,
"Unexpected EFI_MEMORY_DESCRIPTOR version %ld",
@@ -218,19 +226,6 @@ void __init efi_print_memmap(void)
}
}
-void __init efi_unmap_memmap(void)
-{
- unsigned long size;
-
- clear_bit(EFI_MEMMAP, &efi.flags);
-
- size = efi.memmap.nr_map * efi.memmap.desc_size;
- if (efi.memmap.map) {
- early_memunmap(efi.memmap.map, size);
- efi.memmap.map = NULL;
- }
-}
-
static int __init efi_systab_init(void *phys)
{
if (efi_enabled(EFI_64BIT)) {
@@ -414,33 +409,6 @@ static int __init efi_runtime_init(void)
return 0;
}
-static int __init efi_memmap_init(void)
-{
- unsigned long addr, size;
-
- if (efi_enabled(EFI_PARAVIRT))
- return 0;
-
- /* Map the EFI memory map */
- size = efi.memmap.nr_map * efi.memmap.desc_size;
- addr = (unsigned long)efi.memmap.phys_map;
-
- efi.memmap.map = early_memremap(addr, size);
- if (efi.memmap.map == NULL) {
- pr_err("Could not map the memory map!\n");
- return -ENOMEM;
- }
-
- efi.memmap.map_end = efi.memmap.map + size;
-
- if (add_efi_memmap)
- do_add_efi_memmap();
-
- set_bit(EFI_MEMMAP, &efi.flags);
-
- return 0;
-}
-
void __init efi_init(void)
{
efi_char16_t *c16;
@@ -498,16 +466,14 @@ void __init efi_init(void)
if (!efi_runtime_supported())
pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n");
else {
- if (efi_runtime_disabled() || efi_runtime_init())
+ if (efi_runtime_disabled() || efi_runtime_init()) {
+ efi_memmap_unmap();
return;
+ }
}
- if (efi_memmap_init())
- return;
if (efi_enabled(EFI_DBG))
efi_print_memmap();
-
- efi_esrt_init();
}
void __init efi_late_init(void)
@@ -624,42 +590,6 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
}
}
-static void __init save_runtime_map(void)
-{
-#ifdef CONFIG_KEXEC_CORE
- unsigned long desc_size;
- efi_memory_desc_t *md;
- void *tmp, *q = NULL;
- int count = 0;
-
- if (efi_enabled(EFI_OLD_MEMMAP))
- return;
-
- desc_size = efi.memmap.desc_size;
-
- for_each_efi_memory_desc(md) {
- if (!(md->attribute & EFI_MEMORY_RUNTIME) ||
- (md->type == EFI_BOOT_SERVICES_CODE) ||
- (md->type == EFI_BOOT_SERVICES_DATA))
- continue;
- tmp = krealloc(q, (count + 1) * desc_size, GFP_KERNEL);
- if (!tmp)
- goto out;
- q = tmp;
-
- memcpy(q + count * desc_size, md, desc_size);
- count++;
- }
-
- efi_runtime_map_setup(q, count, desc_size);
- return;
-
-out:
- kfree(q);
- pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n");
-#endif
-}
-
static void *realloc_pages(void *old_memmap, int old_shift)
{
void *ret;
@@ -745,6 +675,46 @@ static void *efi_map_next_entry(void *entry)
return entry;
}
+static bool should_map_region(efi_memory_desc_t *md)
+{
+ /*
+ * Runtime regions always require runtime mappings (obviously).
+ */
+ if (md->attribute & EFI_MEMORY_RUNTIME)
+ return true;
+
+ /*
+ * 32-bit EFI doesn't suffer from the bug that requires us to
+ * reserve boot services regions, and mixed mode support
+ * doesn't exist for 32-bit kernels.
+ */
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
+ /*
+ * Map all of RAM so that we can access arguments in the 1:1
+ * mapping when making EFI runtime calls.
+ */
+ if (IS_ENABLED(CONFIG_EFI_MIXED) && !efi_is_native()) {
+ if (md->type == EFI_CONVENTIONAL_MEMORY ||
+ md->type == EFI_LOADER_DATA ||
+ md->type == EFI_LOADER_CODE)
+ return true;
+ }
+
+ /*
+ * Map boot services regions as a workaround for buggy
+ * firmware that accesses them even when they shouldn't.
+ *
+ * See efi_{reserve,free}_boot_services().
+ */
+ if (md->type == EFI_BOOT_SERVICES_CODE ||
+ md->type == EFI_BOOT_SERVICES_DATA)
+ return true;
+
+ return false;
+}
+
/*
* Map the efi memory ranges of the runtime services and update new_mmap with
* virtual addresses.
@@ -761,13 +731,9 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
p = NULL;
while ((p = efi_map_next_entry(p))) {
md = p;
- if (!(md->attribute & EFI_MEMORY_RUNTIME)) {
-#ifdef CONFIG_X86_64
- if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
-#endif
- continue;
- }
+
+ if (!should_map_region(md))
+ continue;
efi_map_region(md);
get_systab_virt_addr(md);
@@ -803,7 +769,7 @@ static void __init kexec_enter_virtual_mode(void)
* non-native EFI
*/
if (!efi_is_native()) {
- efi_unmap_memmap();
+ efi_memmap_unmap();
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
}
@@ -823,7 +789,18 @@ static void __init kexec_enter_virtual_mode(void)
get_systab_virt_addr(md);
}
- save_runtime_map();
+ /*
+ * Unregister the early EFI memmap from efi_init() and install
+ * the new EFI memory map.
+ */
+ efi_memmap_unmap();
+
+ if (efi_memmap_init_late(efi.memmap.phys_map,
+ efi.memmap.desc_size * efi.memmap.nr_map)) {
+ pr_err("Failed to remap late EFI memory map\n");
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
BUG_ON(!efi.systab);
@@ -884,6 +861,7 @@ static void __init __efi_enter_virtual_mode(void)
int count = 0, pg_shift = 0;
void *new_memmap = NULL;
efi_status_t status;
+ phys_addr_t pa;
efi.systab = NULL;
@@ -901,11 +879,24 @@ static void __init __efi_enter_virtual_mode(void)
return;
}
- save_runtime_map();
+ pa = __pa(new_memmap);
+
+ /*
+ * Unregister the early EFI memmap from efi_init() and install
+ * the new EFI memory map that we are about to pass to the
+ * firmware via SetVirtualAddressMap().
+ */
+ efi_memmap_unmap();
+
+ if (efi_memmap_init_late(pa, efi.memmap.desc_size * count)) {
+ pr_err("Failed to remap late EFI memory map\n");
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
BUG_ON(!efi.systab);
- if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) {
+ if (efi_setup_page_tables(pa, 1 << pg_shift)) {
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
}
@@ -917,14 +908,14 @@ static void __init __efi_enter_virtual_mode(void)
efi.memmap.desc_size * count,
efi.memmap.desc_size,
efi.memmap.desc_version,
- (efi_memory_desc_t *)__pa(new_memmap));
+ (efi_memory_desc_t *)pa);
} else {
status = efi_thunk_set_virtual_address_map(
efi_phys.set_virtual_address_map,
efi.memmap.desc_size * count,
efi.memmap.desc_size,
efi.memmap.desc_version,
- (efi_memory_desc_t *)__pa(new_memmap));
+ (efi_memory_desc_t *)pa);
}
if (status != EFI_SUCCESS) {
@@ -956,15 +947,6 @@ static void __init __efi_enter_virtual_mode(void)
efi_runtime_update_mappings();
efi_dump_pagetable();
- /*
- * We mapped the descriptor array into the EFI pagetable above
- * but we're not unmapping it here because if we're running in
- * EFI mixed mode we need all of memory to be accessible when
- * we pass parameters to the EFI runtime services in the
- * thunking code.
- */
- free_pages((unsigned long)new_memmap, pg_shift);
-
/* clean DUMMY object */
efi_delete_dummy_variable();
}
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 8dd3784eb075..58b0f801f66f 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -85,7 +85,7 @@ pgd_t * __init efi_call_phys_prolog(void)
early_code_mapping_set_exec(1);
n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
- save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
+ save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL);
for (pgd = 0; pgd < n_pgds; pgd++) {
save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE);
@@ -214,7 +214,6 @@ void efi_sync_low_kernel_mappings(void)
int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
unsigned long pfn, text;
- efi_memory_desc_t *md;
struct page *page;
unsigned npages;
pgd_t *pgd;
@@ -248,25 +247,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
if (!IS_ENABLED(CONFIG_EFI_MIXED) || efi_is_native())
return 0;
- /*
- * Map all of RAM so that we can access arguments in the 1:1
- * mapping when making EFI runtime calls.
- */
- for_each_efi_memory_desc(md) {
- if (md->type != EFI_CONVENTIONAL_MEMORY &&
- md->type != EFI_LOADER_DATA &&
- md->type != EFI_LOADER_CODE)
- continue;
-
- pfn = md->phys_addr >> PAGE_SHIFT;
- npages = md->num_pages;
-
- if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, npages, _PAGE_RW)) {
- pr_err("Failed to map 1:1 memory\n");
- return 1;
- }
- }
-
page = alloc_page(GFP_KERNEL|__GFP_DMA32);
if (!page)
panic("Unable to allocate EFI runtime stack < 4GB\n");
@@ -359,6 +339,7 @@ void __init efi_map_region(efi_memory_desc_t *md)
*/
void __init efi_map_region_fixed(efi_memory_desc_t *md)
{
+ __map_region(md, md->phys_addr);
__map_region(md, md->virt_addr);
}
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 89d1146f5a6f..10aca63a50d7 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -164,6 +164,75 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size,
EXPORT_SYMBOL_GPL(efi_query_variable_store);
/*
+ * The UEFI specification makes it clear that the operating system is
+ * free to do whatever it wants with boot services code after
+ * ExitBootServices() has been called. Ignoring this recommendation a
+ * significant bunch of EFI implementations continue calling into boot
+ * services code (SetVirtualAddressMap). In order to work around such
+ * buggy implementations we reserve boot services region during EFI
+ * init and make sure it stays executable. Then, after
+ * SetVirtualAddressMap(), it is discarded.
+ *
+ * However, some boot services regions contain data that is required
+ * by drivers, so we need to track which memory ranges can never be
+ * freed. This is done by tagging those regions with the
+ * EFI_MEMORY_RUNTIME attribute.
+ *
+ * Any driver that wants to mark a region as reserved must use
+ * efi_mem_reserve() which will insert a new EFI memory descriptor
+ * into efi.memmap (splitting existing regions if necessary) and tag
+ * it with EFI_MEMORY_RUNTIME.
+ */
+void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
+{
+ phys_addr_t new_phys, new_size;
+ struct efi_mem_range mr;
+ efi_memory_desc_t md;
+ int num_entries;
+ void *new;
+
+ if (efi_mem_desc_lookup(addr, &md)) {
+ pr_err("Failed to lookup EFI memory descriptor for %pa\n", &addr);
+ return;
+ }
+
+ if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) {
+ pr_err("Region spans EFI memory descriptors, %pa\n", &addr);
+ return;
+ }
+
+ size += addr % EFI_PAGE_SIZE;
+ size = round_up(size, EFI_PAGE_SIZE);
+ addr = round_down(addr, EFI_PAGE_SIZE);
+
+ mr.range.start = addr;
+ mr.range.end = addr + size - 1;
+ mr.attribute = md.attribute | EFI_MEMORY_RUNTIME;
+
+ num_entries = efi_memmap_split_count(&md, &mr.range);
+ num_entries += efi.memmap.nr_map;
+
+ new_size = efi.memmap.desc_size * num_entries;
+
+ new_phys = memblock_alloc(new_size, 0);
+ if (!new_phys) {
+ pr_err("Could not allocate boot services memmap\n");
+ return;
+ }
+
+ new = early_memremap(new_phys, new_size);
+ if (!new) {
+ pr_err("Failed to map new boot services memmap\n");
+ return;
+ }
+
+ efi_memmap_insert(&efi.memmap, new, &mr);
+ early_memunmap(new, new_size);
+
+ efi_memmap_install(new_phys, num_entries);
+}
+
+/*
* Helper function for efi_reserve_boot_services() to figure out if we
* can free regions in efi_free_boot_services().
*
@@ -184,15 +253,6 @@ static bool can_free_region(u64 start, u64 size)
return true;
}
-/*
- * The UEFI specification makes it clear that the operating system is free to do
- * whatever it wants with boot services code after ExitBootServices() has been
- * called. Ignoring this recommendation a significant bunch of EFI implementations
- * continue calling into boot services code (SetVirtualAddressMap). In order to
- * work around such buggy implementations we reserve boot services region during
- * EFI init and make sure it stays executable. Then, after SetVirtualAddressMap(), it
-* is discarded.
-*/
void __init efi_reserve_boot_services(void)
{
efi_memory_desc_t *md;
@@ -249,7 +309,10 @@ void __init efi_reserve_boot_services(void)
void __init efi_free_boot_services(void)
{
+ phys_addr_t new_phys, new_size;
efi_memory_desc_t *md;
+ int num_entries = 0;
+ void *new, *new_md;
for_each_efi_memory_desc(md) {
unsigned long long start = md->phys_addr;
@@ -257,12 +320,16 @@ void __init efi_free_boot_services(void)
size_t rm_size;
if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
+ md->type != EFI_BOOT_SERVICES_DATA) {
+ num_entries++;
continue;
+ }
/* Do not free, someone else owns it: */
- if (md->attribute & EFI_MEMORY_RUNTIME)
+ if (md->attribute & EFI_MEMORY_RUNTIME) {
+ num_entries++;
continue;
+ }
/*
* Nasty quirk: if all sub-1MB memory is used for boot
@@ -287,7 +354,41 @@ void __init efi_free_boot_services(void)
free_bootmem_late(start, size);
}
- efi_unmap_memmap();
+ new_size = efi.memmap.desc_size * num_entries;
+ new_phys = memblock_alloc(new_size, 0);
+ if (!new_phys) {
+ pr_err("Failed to allocate new EFI memmap\n");
+ return;
+ }
+
+ new = memremap(new_phys, new_size, MEMREMAP_WB);
+ if (!new) {
+ pr_err("Failed to map new EFI memmap\n");
+ return;
+ }
+
+ /*
+ * Build a new EFI memmap that excludes any boot services
+ * regions that are not tagged EFI_MEMORY_RUNTIME, since those
+ * regions have now been freed.
+ */
+ new_md = new;
+ for_each_efi_memory_desc(md) {
+ if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
+ (md->type == EFI_BOOT_SERVICES_CODE ||
+ md->type == EFI_BOOT_SERVICES_DATA))
+ continue;
+
+ memcpy(new_md, md, efi.memmap.desc_size);
+ new_md += efi.memmap.desc_size;
+ }
+
+ memunmap(new);
+
+ if (efi_memmap_install(new_phys, num_entries)) {
+ pr_err("Could not install new EFI memmap\n");
+ return;
+ }
}
/*
@@ -365,7 +466,7 @@ void __init efi_apply_memmap_quirks(void)
*/
if (!efi_runtime_supported()) {
pr_info("Setup done, disabling due to 32/64-bit mismatch\n");
- efi_unmap_memmap();
+ efi_memmap_unmap();
}
/* UV2+ BIOS has a fix for this issue. UV1 still needs the quirk. */
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
index fc135bf70511..429d08be7848 100644
--- a/arch/x86/platform/intel-mid/device_libs/Makefile
+++ b/arch/x86/platform/intel-mid/device_libs/Makefile
@@ -1,5 +1,9 @@
# Family-Level Interface Shim (FLIS)
obj-$(subst m,y,$(CONFIG_PINCTRL_MERRIFIELD)) += platform_mrfld_pinctrl.o
+# SDHCI Devices
+obj-$(subst m,y,$(CONFIG_MMC_SDHCI_PCI)) += platform_mrfld_sd.o
+# WiFi
+obj-$(subst m,y,$(CONFIG_BRCMFMAC_SDIO)) += platform_bcm43xx.o
# IPC Devices
obj-y += platform_ipc.o
obj-$(subst m,y,$(CONFIG_MFD_INTEL_MSIC)) += platform_msic.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
new file mode 100644
index 000000000000..4392c15ed9e0
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
@@ -0,0 +1,95 @@
+/*
+ * platform_bcm43xx.c: bcm43xx platform data initilization file
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/gpio.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/machine.h>
+#include <linux/regulator/fixed.h>
+#include <linux/sfi.h>
+
+#include <asm/intel-mid.h>
+
+#define WLAN_SFI_GPIO_IRQ_NAME "WLAN-interrupt"
+#define WLAN_SFI_GPIO_ENABLE_NAME "WLAN-enable"
+
+#define WLAN_DEV_NAME "0000:00:01.3"
+
+static struct regulator_consumer_supply bcm43xx_vmmc_supply = {
+ .dev_name = WLAN_DEV_NAME,
+ .supply = "vmmc",
+};
+
+static struct regulator_init_data bcm43xx_vmmc_data = {
+ .constraints = {
+ .valid_ops_mask = REGULATOR_CHANGE_STATUS,
+ },
+ .num_consumer_supplies = 1,
+ .consumer_supplies = &bcm43xx_vmmc_supply,
+};
+
+static struct fixed_voltage_config bcm43xx_vmmc = {
+ .supply_name = "bcm43xx-vmmc-regulator",
+ /*
+ * Announce 2.0V here to be compatible with SDIO specification. The
+ * real voltage and signaling are still 1.8V.
+ */
+ .microvolts = 2000000, /* 1.8V */
+ .gpio = -EINVAL,
+ .startup_delay = 250 * 1000, /* 250ms */
+ .enable_high = 1, /* active high */
+ .enabled_at_boot = 0, /* disabled at boot */
+ .init_data = &bcm43xx_vmmc_data,
+};
+
+static struct platform_device bcm43xx_vmmc_regulator = {
+ .name = "reg-fixed-voltage",
+ .id = PLATFORM_DEVID_AUTO,
+ .dev = {
+ .platform_data = &bcm43xx_vmmc,
+ },
+};
+
+static int __init bcm43xx_regulator_register(void)
+{
+ int ret;
+
+ bcm43xx_vmmc.gpio = get_gpio_by_name(WLAN_SFI_GPIO_ENABLE_NAME);
+ ret = platform_device_register(&bcm43xx_vmmc_regulator);
+ if (ret) {
+ pr_err("%s: vmmc regulator register failed\n", __func__);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __init *bcm43xx_platform_data(void *info)
+{
+ int ret;
+
+ ret = bcm43xx_regulator_register();
+ if (ret)
+ return NULL;
+
+ pr_info("Using generic wifi platform data\n");
+
+ /* For now it's empty */
+ return NULL;
+}
+
+static const struct devs_id bcm43xx_clk_vmmc_dev_id __initconst = {
+ .name = "bcm43xx_clk_vmmc",
+ .type = SFI_DEV_TYPE_SD,
+ .get_platform_data = &bcm43xx_platform_data,
+};
+
+sfi_device(bcm43xx_clk_vmmc_dev_id);
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
new file mode 100644
index 000000000000..00c4a034ad93
--- /dev/null
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mrfld_sd.c
@@ -0,0 +1,47 @@
+/*
+ * SDHCI platform data initilisation file
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/init.h>
+#include <linux/pci.h>
+
+#include <linux/mmc/sdhci-pci-data.h>
+
+#include <asm/intel-mid.h>
+
+#define INTEL_MRFLD_SD 2
+#define INTEL_MRFLD_SD_CD_GPIO 77
+
+static struct sdhci_pci_data mrfld_sdhci_pci_data = {
+ .rst_n_gpio = -EINVAL,
+ .cd_gpio = INTEL_MRFLD_SD_CD_GPIO,
+};
+
+static struct sdhci_pci_data *
+mrfld_sdhci_pci_get_data(struct pci_dev *pdev, int slotno)
+{
+ unsigned int func = PCI_FUNC(pdev->devfn);
+
+ if (func == INTEL_MRFLD_SD)
+ return &mrfld_sdhci_pci_data;
+
+ return NULL;
+}
+
+static int __init mrfld_sd_init(void)
+{
+ if (intel_mid_identify_cpu() != INTEL_MID_CPU_CHIP_TANGIER)
+ return -ENODEV;
+
+ sdhci_pci_get_data = mrfld_sdhci_pci_get_data;
+ return 0;
+}
+arch_initcall(mrfld_sd_init);
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index ce119d2ba0d0..7850128f0026 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -70,6 +70,11 @@ EXPORT_SYMBOL_GPL(__intel_mid_cpu_chip);
static void intel_mid_power_off(void)
{
+ /* Shut down South Complex via PWRMU */
+ intel_mid_pwr_power_off();
+
+ /* Only for Tangier, the rest will ignore this command */
+ intel_scu_ipc_simple_command(IPCMSG_COLD_OFF, 1);
};
static void intel_mid_reboot(void)
diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c
index c901a3423772..5d3b45ad1c03 100644
--- a/arch/x86/platform/intel-mid/pwr.c
+++ b/arch/x86/platform/intel-mid/pwr.c
@@ -44,7 +44,19 @@
/* Bits in PM_CMD */
#define PM_CMD_CMD(x) ((x) << 0)
#define PM_CMD_IOC (1 << 8)
-#define PM_CMD_D3cold (1 << 21)
+#define PM_CMD_CM_NOP (0 << 9)
+#define PM_CMD_CM_IMMEDIATE (1 << 9)
+#define PM_CMD_CM_DELAY (2 << 9)
+#define PM_CMD_CM_TRIGGER (3 << 9)
+
+/* System states */
+#define PM_CMD_SYS_STATE_S5 (5 << 16)
+
+/* Trigger variants */
+#define PM_CMD_CFG_TRIGGER_NC (3 << 19)
+
+/* Message to wait for TRIGGER_NC case */
+#define TRIGGER_NC_MSG_2 (2 << 22)
/* List of commands */
#define CMD_SET_CFG 0x01
@@ -137,7 +149,7 @@ static int mid_pwr_wait(struct mid_pwr *pwr)
static int mid_pwr_wait_for_cmd(struct mid_pwr *pwr, u8 cmd)
{
- writel(PM_CMD_CMD(cmd), pwr->regs + PM_CMD);
+ writel(PM_CMD_CMD(cmd) | PM_CMD_CM_IMMEDIATE, pwr->regs + PM_CMD);
return mid_pwr_wait(pwr);
}
@@ -260,6 +272,20 @@ int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state)
}
EXPORT_SYMBOL_GPL(intel_mid_pci_set_power_state);
+void intel_mid_pwr_power_off(void)
+{
+ struct mid_pwr *pwr = midpwr;
+ u32 cmd = PM_CMD_SYS_STATE_S5 |
+ PM_CMD_CMD(CMD_SET_CFG) |
+ PM_CMD_CM_TRIGGER |
+ PM_CMD_CFG_TRIGGER_NC |
+ TRIGGER_NC_MSG_2;
+
+ /* Send command to SCU */
+ writel(cmd, pwr->regs + PM_CMD);
+ mid_pwr_wait(pwr);
+}
+
int intel_mid_pwr_get_lss_id(struct pci_dev *pdev)
{
int vndr;
@@ -354,7 +380,7 @@ static int mid_pwr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
return 0;
}
-static int mid_set_initial_state(struct mid_pwr *pwr)
+static int mid_set_initial_state(struct mid_pwr *pwr, const u32 *states)
{
unsigned int i, j;
int ret;
@@ -379,10 +405,10 @@ static int mid_set_initial_state(struct mid_pwr *pwr)
* NOTE: The actual device mapping is provided by a platform at run
* time using vendor capability of PCI configuration space.
*/
- mid_pwr_set_state(pwr, 0, 0xffffffff);
- mid_pwr_set_state(pwr, 1, 0xffffffff);
- mid_pwr_set_state(pwr, 2, 0xffffffff);
- mid_pwr_set_state(pwr, 3, 0xffffffff);
+ mid_pwr_set_state(pwr, 0, states[0]);
+ mid_pwr_set_state(pwr, 1, states[1]);
+ mid_pwr_set_state(pwr, 2, states[2]);
+ mid_pwr_set_state(pwr, 3, states[3]);
/* Send command to SCU */
ret = mid_pwr_wait_for_cmd(pwr, CMD_SET_CFG);
@@ -397,13 +423,41 @@ static int mid_set_initial_state(struct mid_pwr *pwr)
return 0;
}
-static const struct mid_pwr_device_info mid_info = {
- .set_initial_state = mid_set_initial_state,
+static int pnw_set_initial_state(struct mid_pwr *pwr)
+{
+ /* On Penwell SRAM must stay powered on */
+ const u32 states[] = {
+ 0xf00fffff, /* PM_SSC(0) */
+ 0xffffffff, /* PM_SSC(1) */
+ 0xffffffff, /* PM_SSC(2) */
+ 0xffffffff, /* PM_SSC(3) */
+ };
+ return mid_set_initial_state(pwr, states);
+}
+
+static int tng_set_initial_state(struct mid_pwr *pwr)
+{
+ const u32 states[] = {
+ 0xffffffff, /* PM_SSC(0) */
+ 0xffffffff, /* PM_SSC(1) */
+ 0xffffffff, /* PM_SSC(2) */
+ 0xffffffff, /* PM_SSC(3) */
+ };
+ return mid_set_initial_state(pwr, states);
+}
+
+static const struct mid_pwr_device_info pnw_info = {
+ .set_initial_state = pnw_set_initial_state,
+};
+
+static const struct mid_pwr_device_info tng_info = {
+ .set_initial_state = tng_set_initial_state,
};
+/* This table should be in sync with the one in drivers/pci/pci-mid.c */
static const struct pci_device_id mid_pwr_pci_ids[] = {
- { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PENWELL), (kernel_ulong_t)&mid_info },
- { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_TANGIER), (kernel_ulong_t)&mid_info },
+ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_PENWELL), (kernel_ulong_t)&pnw_info },
+ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_TANGIER), (kernel_ulong_t)&tng_info },
{}
};
diff --git a/arch/x86/platform/mellanox/Makefile b/arch/x86/platform/mellanox/Makefile
new file mode 100644
index 000000000000..f43c93188a1d
--- /dev/null
+++ b/arch/x86/platform/mellanox/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_MLX_PLATFORM) += mlx-platform.o
diff --git a/arch/x86/platform/mellanox/mlx-platform.c b/arch/x86/platform/mellanox/mlx-platform.c
new file mode 100644
index 000000000000..7dcfcca97399
--- /dev/null
+++ b/arch/x86/platform/mellanox/mlx-platform.c
@@ -0,0 +1,266 @@
+/*
+ * arch/x86/platform/mellanox/mlx-platform.c
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Vadim Pasternak <vadimp@mellanox.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/device.h>
+#include <linux/dmi.h>
+#include <linux/i2c.h>
+#include <linux/i2c-mux.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/platform_data/i2c-mux-reg.h>
+
+#define MLX_PLAT_DEVICE_NAME "mlxplat"
+
+/* LPC bus IO offsets */
+#define MLXPLAT_CPLD_LPC_I2C_BASE_ADRR 0x2000
+#define MLXPLAT_CPLD_LPC_REG_BASE_ADRR 0x2500
+#define MLXPLAT_CPLD_LPC_IO_RANGE 0x100
+#define MLXPLAT_CPLD_LPC_I2C_CH1_OFF 0xdb
+#define MLXPLAT_CPLD_LPC_I2C_CH2_OFF 0xda
+#define MLXPLAT_CPLD_LPC_PIO_OFFSET 0x10000UL
+#define MLXPLAT_CPLD_LPC_REG1 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \
+ MLXPLAT_CPLD_LPC_I2C_CH1_OFF) | \
+ MLXPLAT_CPLD_LPC_PIO_OFFSET)
+#define MLXPLAT_CPLD_LPC_REG2 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \
+ MLXPLAT_CPLD_LPC_I2C_CH2_OFF) | \
+ MLXPLAT_CPLD_LPC_PIO_OFFSET)
+
+/* Start channel numbers */
+#define MLXPLAT_CPLD_CH1 2
+#define MLXPLAT_CPLD_CH2 10
+
+/* Number of LPC attached MUX platform devices */
+#define MLXPLAT_CPLD_LPC_MUX_DEVS 2
+
+/* mlxplat_priv - platform private data
+ * @pdev_i2c - i2c controller platform device
+ * @pdev_mux - array of mux platform devices
+ */
+struct mlxplat_priv {
+ struct platform_device *pdev_i2c;
+ struct platform_device *pdev_mux[MLXPLAT_CPLD_LPC_MUX_DEVS];
+};
+
+/* Regions for LPC I2C controller and LPC base register space */
+static const struct resource mlxplat_lpc_resources[] = {
+ [0] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_I2C_BASE_ADRR,
+ MLXPLAT_CPLD_LPC_IO_RANGE,
+ "mlxplat_cpld_lpc_i2c_ctrl", IORESOURCE_IO),
+ [1] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_REG_BASE_ADRR,
+ MLXPLAT_CPLD_LPC_IO_RANGE,
+ "mlxplat_cpld_lpc_regs",
+ IORESOURCE_IO),
+};
+
+/* Platform default channels */
+static const int mlxplat_default_channels[][8] = {
+ {
+ MLXPLAT_CPLD_CH1, MLXPLAT_CPLD_CH1 + 1, MLXPLAT_CPLD_CH1 + 2,
+ MLXPLAT_CPLD_CH1 + 3, MLXPLAT_CPLD_CH1 + 4, MLXPLAT_CPLD_CH1 +
+ 5, MLXPLAT_CPLD_CH1 + 6, MLXPLAT_CPLD_CH1 + 7
+ },
+ {
+ MLXPLAT_CPLD_CH2, MLXPLAT_CPLD_CH2 + 1, MLXPLAT_CPLD_CH2 + 2,
+ MLXPLAT_CPLD_CH2 + 3, MLXPLAT_CPLD_CH2 + 4, MLXPLAT_CPLD_CH2 +
+ 5, MLXPLAT_CPLD_CH2 + 6, MLXPLAT_CPLD_CH2 + 7
+ },
+};
+
+/* Platform channels for MSN21xx system family */
+static const int mlxplat_msn21xx_channels[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+
+/* Platform mux data */
+static struct i2c_mux_reg_platform_data mlxplat_mux_data[] = {
+ {
+ .parent = 1,
+ .base_nr = MLXPLAT_CPLD_CH1,
+ .write_only = 1,
+ .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG1,
+ .reg_size = 1,
+ .idle_in_use = 1,
+ },
+ {
+ .parent = 1,
+ .base_nr = MLXPLAT_CPLD_CH2,
+ .write_only = 1,
+ .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG2,
+ .reg_size = 1,
+ .idle_in_use = 1,
+ },
+
+};
+
+static struct platform_device *mlxplat_dev;
+
+static int __init mlxplat_dmi_default_matched(const struct dmi_system_id *dmi)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) {
+ mlxplat_mux_data[i].values = mlxplat_default_channels[i];
+ mlxplat_mux_data[i].n_values =
+ ARRAY_SIZE(mlxplat_default_channels[i]);
+ }
+
+ return 1;
+};
+
+static int __init mlxplat_dmi_msn21xx_matched(const struct dmi_system_id *dmi)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) {
+ mlxplat_mux_data[i].values = mlxplat_msn21xx_channels;
+ mlxplat_mux_data[i].n_values =
+ ARRAY_SIZE(mlxplat_msn21xx_channels);
+ }
+
+ return 1;
+};
+
+static struct dmi_system_id mlxplat_dmi_table[] __initdata = {
+ {
+ .callback = mlxplat_dmi_default_matched,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MSN24"),
+ },
+ },
+ {
+ .callback = mlxplat_dmi_default_matched,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MSN27"),
+ },
+ },
+ {
+ .callback = mlxplat_dmi_default_matched,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MSB"),
+ },
+ },
+ {
+ .callback = mlxplat_dmi_default_matched,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MSX"),
+ },
+ },
+ {
+ .callback = mlxplat_dmi_msn21xx_matched,
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MSN21"),
+ },
+ },
+ { }
+};
+
+static int __init mlxplat_init(void)
+{
+ struct mlxplat_priv *priv;
+ int i, err;
+
+ if (!dmi_check_system(mlxplat_dmi_table))
+ return -ENODEV;
+
+ mlxplat_dev = platform_device_register_simple(MLX_PLAT_DEVICE_NAME, -1,
+ mlxplat_lpc_resources,
+ ARRAY_SIZE(mlxplat_lpc_resources));
+
+ if (IS_ERR(mlxplat_dev))
+ return PTR_ERR(mlxplat_dev);
+
+ priv = devm_kzalloc(&mlxplat_dev->dev, sizeof(struct mlxplat_priv),
+ GFP_KERNEL);
+ if (!priv) {
+ err = -ENOMEM;
+ goto fail_alloc;
+ }
+ platform_set_drvdata(mlxplat_dev, priv);
+
+ priv->pdev_i2c = platform_device_register_simple("i2c_mlxcpld", -1,
+ NULL, 0);
+ if (IS_ERR(priv->pdev_i2c)) {
+ err = PTR_ERR(priv->pdev_i2c);
+ goto fail_alloc;
+ };
+
+ for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) {
+ priv->pdev_mux[i] = platform_device_register_resndata(
+ &mlxplat_dev->dev,
+ "i2c-mux-reg", i, NULL,
+ 0, &mlxplat_mux_data[i],
+ sizeof(mlxplat_mux_data[i]));
+ if (IS_ERR(priv->pdev_mux[i])) {
+ err = PTR_ERR(priv->pdev_mux[i]);
+ goto fail_platform_mux_register;
+ }
+ }
+
+ return 0;
+
+fail_platform_mux_register:
+ for (i--; i > 0 ; i--)
+ platform_device_unregister(priv->pdev_mux[i]);
+ platform_device_unregister(priv->pdev_i2c);
+fail_alloc:
+ platform_device_unregister(mlxplat_dev);
+
+ return err;
+}
+module_init(mlxplat_init);
+
+static void __exit mlxplat_exit(void)
+{
+ struct mlxplat_priv *priv = platform_get_drvdata(mlxplat_dev);
+ int i;
+
+ for (i = ARRAY_SIZE(mlxplat_mux_data) - 1; i >= 0 ; i--)
+ platform_device_unregister(priv->pdev_mux[i]);
+
+ platform_device_unregister(priv->pdev_i2c);
+ platform_device_unregister(mlxplat_dev);
+}
+module_exit(mlxplat_exit);
+
+MODULE_AUTHOR("Vadim Pasternak (vadimp@mellanox.com)");
+MODULE_DESCRIPTION("Mellanox platform driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("dmi:*:*Mellanox*:MSN24*:");
+MODULE_ALIAS("dmi:*:*Mellanox*:MSN27*:");
+MODULE_ALIAS("dmi:*:*Mellanox*:MSB*:");
+MODULE_ALIAS("dmi:*:*Mellanox*:MSX*:");
+MODULE_ALIAS("dmi:*:*Mellanox*:MSN21*:");
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 23f2f3e41c7f..4a6a5a26c582 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -40,7 +40,15 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
*/
return BIOS_STATUS_UNIMPLEMENTED;
- ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5);
+ /*
+ * If EFI_OLD_MEMMAP is set, we need to fall back to using our old EFI
+ * callback method, which uses efi_call() directly, with the kernel page tables:
+ */
+ if (unlikely(test_bit(EFI_OLD_MEMMAP, &efi.flags)))
+ ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5);
+ else
+ ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5);
+
return ret;
}
EXPORT_SYMBOL_GPL(uv_bios_call);
@@ -149,11 +157,8 @@ EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
s64
uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
{
- s64 ret;
-
- ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
- (u64)addr, buf, (u64)len, 0);
- return ret;
+ return uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
+ (u64)addr, buf, (u64)len, 0);
}
EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index fdb4d42b4ce5..9e42842e924a 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -24,6 +24,29 @@
#include <asm/irq_vectors.h>
#include <asm/timer.h>
+static struct bau_operations ops;
+
+static struct bau_operations uv123_bau_ops = {
+ .bau_gpa_to_offset = uv_gpa_to_offset,
+ .read_l_sw_ack = read_mmr_sw_ack,
+ .read_g_sw_ack = read_gmmr_sw_ack,
+ .write_l_sw_ack = write_mmr_sw_ack,
+ .write_g_sw_ack = write_gmmr_sw_ack,
+ .write_payload_first = write_mmr_payload_first,
+ .write_payload_last = write_mmr_payload_last,
+};
+
+static struct bau_operations uv4_bau_ops = {
+ .bau_gpa_to_offset = uv_gpa_to_soc_phys_ram,
+ .read_l_sw_ack = read_mmr_proc_sw_ack,
+ .read_g_sw_ack = read_gmmr_proc_sw_ack,
+ .write_l_sw_ack = write_mmr_proc_sw_ack,
+ .write_g_sw_ack = write_gmmr_proc_sw_ack,
+ .write_payload_first = write_mmr_proc_payload_first,
+ .write_payload_last = write_mmr_proc_payload_last,
+};
+
+
/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
static int timeout_base_ns[] = {
20,
@@ -55,16 +78,16 @@ static int congested_reps = CONGESTED_REPS;
static int disabled_period = DISABLED_PERIOD;
static struct tunables tunables[] = {
- {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
- {&plugged_delay, PLUGGED_DELAY},
- {&plugsb4reset, PLUGSB4RESET},
- {&timeoutsb4reset, TIMEOUTSB4RESET},
- {&ipi_reset_limit, IPI_RESET_LIMIT},
- {&complete_threshold, COMPLETE_THRESHOLD},
- {&congested_respns_us, CONGESTED_RESPONSE_US},
- {&congested_reps, CONGESTED_REPS},
- {&disabled_period, DISABLED_PERIOD},
- {&giveup_limit, GIVEUP_LIMIT}
+ {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
+ {&plugged_delay, PLUGGED_DELAY},
+ {&plugsb4reset, PLUGSB4RESET},
+ {&timeoutsb4reset, TIMEOUTSB4RESET},
+ {&ipi_reset_limit, IPI_RESET_LIMIT},
+ {&complete_threshold, COMPLETE_THRESHOLD},
+ {&congested_respns_us, CONGESTED_RESPONSE_US},
+ {&congested_reps, CONGESTED_REPS},
+ {&disabled_period, DISABLED_PERIOD},
+ {&giveup_limit, GIVEUP_LIMIT}
};
static struct dentry *tunables_dir;
@@ -216,7 +239,7 @@ static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp,
msg = mdp->msg;
if (!msg->canceled && do_acknowledge) {
dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
- write_mmr_sw_ack(dw);
+ ops.write_l_sw_ack(dw);
}
msg->replied_to = 1;
msg->swack_vec = 0;
@@ -252,7 +275,7 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
msg->swack_vec) == 0) &&
(msg2->sending_cpu == msg->sending_cpu) &&
(msg2->msg_type != MSG_NOOP)) {
- mmr = read_mmr_sw_ack();
+ mmr = ops.read_l_sw_ack();
msg_res = msg2->swack_vec;
/*
* This is a message retry; clear the resources held
@@ -270,7 +293,7 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
stat->d_canceled++;
cancel_count++;
mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
- write_mmr_sw_ack(mr);
+ ops.write_l_sw_ack(mr);
}
}
}
@@ -403,12 +426,12 @@ static void do_reset(void *ptr)
/*
* only reset the resource if it is still pending
*/
- mmr = read_mmr_sw_ack();
+ mmr = ops.read_l_sw_ack();
msg_res = msg->swack_vec;
mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
if (mmr & msg_res) {
stat->d_rcanceled++;
- write_mmr_sw_ack(mr);
+ ops.write_l_sw_ack(mr);
}
}
}
@@ -580,11 +603,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
*/
static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc)
{
- unsigned long descriptor_status;
-
- descriptor_status =
- ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
- return descriptor_status;
+ return ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
}
/*
@@ -1202,7 +1221,7 @@ void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
struct bau_pq_entry *msg = mdp->msg;
struct bau_pq_entry *other_msg;
- mmr_image = read_mmr_sw_ack();
+ mmr_image = ops.read_l_sw_ack();
swack_vec = msg->swack_vec;
if ((swack_vec & mmr_image) == 0) {
@@ -1431,7 +1450,7 @@ static int ptc_seq_show(struct seq_file *file, void *data)
/* destination side statistics */
seq_printf(file,
"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
- read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
+ ops.read_g_sw_ack(uv_cpu_to_pnode(cpu)),
stat->d_requestee, cycles_2_us(stat->d_time),
stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
stat->d_nomsg, stat->d_retries, stat->d_canceled,
@@ -1497,16 +1516,16 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user,
}
if (kstrtol(optstr, 10, &input_arg) < 0) {
- printk(KERN_DEBUG "%s is invalid\n", optstr);
+ pr_debug("%s is invalid\n", optstr);
return -EINVAL;
}
if (input_arg == 0) {
elements = ARRAY_SIZE(stat_description);
- printk(KERN_DEBUG "# cpu: cpu number\n");
- printk(KERN_DEBUG "Sender statistics:\n");
+ pr_debug("# cpu: cpu number\n");
+ pr_debug("Sender statistics:\n");
for (i = 0; i < elements; i++)
- printk(KERN_DEBUG "%s\n", stat_description[i]);
+ pr_debug("%s\n", stat_description[i]);
} else if (input_arg == -1) {
for_each_present_cpu(cpu) {
stat = &per_cpu(ptcstats, cpu);
@@ -1554,7 +1573,7 @@ static int parse_tunables_write(struct bau_control *bcp, char *instr,
break;
}
if (cnt != e) {
- printk(KERN_INFO "bau tunable error: should be %d values\n", e);
+ pr_info("bau tunable error: should be %d values\n", e);
return -EINVAL;
}
@@ -1571,7 +1590,7 @@ static int parse_tunables_write(struct bau_control *bcp, char *instr,
continue;
}
if (val < 1 || val > bcp->cpus_in_uvhub) {
- printk(KERN_DEBUG
+ pr_debug(
"Error: BAU max concurrent %d is invalid\n",
val);
return -EINVAL;
@@ -1619,17 +1638,17 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
for_each_present_cpu(cpu) {
bcp = &per_cpu(bau_control, cpu);
- bcp->max_concurr = max_concurr;
- bcp->max_concurr_const = max_concurr;
- bcp->plugged_delay = plugged_delay;
- bcp->plugsb4reset = plugsb4reset;
- bcp->timeoutsb4reset = timeoutsb4reset;
- bcp->ipi_reset_limit = ipi_reset_limit;
- bcp->complete_threshold = complete_threshold;
- bcp->cong_response_us = congested_respns_us;
- bcp->cong_reps = congested_reps;
- bcp->disabled_period = sec_2_cycles(disabled_period);
- bcp->giveup_limit = giveup_limit;
+ bcp->max_concurr = max_concurr;
+ bcp->max_concurr_const = max_concurr;
+ bcp->plugged_delay = plugged_delay;
+ bcp->plugsb4reset = plugsb4reset;
+ bcp->timeoutsb4reset = timeoutsb4reset;
+ bcp->ipi_reset_limit = ipi_reset_limit;
+ bcp->complete_threshold = complete_threshold;
+ bcp->cong_response_us = congested_respns_us;
+ bcp->cong_reps = congested_reps;
+ bcp->disabled_period = sec_2_cycles(disabled_period);
+ bcp->giveup_limit = giveup_limit;
}
return count;
}
@@ -1676,21 +1695,21 @@ static int __init uv_ptc_init(void)
proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
&proc_uv_ptc_operations);
if (!proc_uv_ptc) {
- printk(KERN_ERR "unable to create %s proc entry\n",
+ pr_err("unable to create %s proc entry\n",
UV_PTC_BASENAME);
return -EINVAL;
}
tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
if (!tunables_dir) {
- printk(KERN_ERR "unable to create debugfs directory %s\n",
+ pr_err("unable to create debugfs directory %s\n",
UV_BAU_TUNABLES_DIR);
return -EINVAL;
}
tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
tunables_dir, NULL, &tunables_fops);
if (!tunables_file) {
- printk(KERN_ERR "unable to create debugfs file %s\n",
+ pr_err("unable to create debugfs file %s\n",
UV_BAU_TUNABLES_FILE);
return -EINVAL;
}
@@ -1725,7 +1744,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
gpa = uv_gpa(bau_desc);
n = uv_gpa_to_gnode(gpa);
- m = uv_gpa_to_offset(gpa);
+ m = ops.bau_gpa_to_offset(gpa);
if (is_uv1_hub())
uv1 = 1;
@@ -1740,7 +1759,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
memset(bd2, 0, sizeof(struct bau_desc));
if (uv1) {
uv1_hdr = &bd2->header.uv1_hdr;
- uv1_hdr->swack_flag = 1;
+ uv1_hdr->swack_flag = 1;
/*
* The base_dest_nasid set in the message header
* is the nasid of the first uvhub in the partition.
@@ -1749,10 +1768,10 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
* if nasid striding is being used.
*/
uv1_hdr->base_dest_nasid =
- UV_PNODE_TO_NASID(base_pnode);
- uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID;
- uv1_hdr->command = UV_NET_ENDPOINT_INTD;
- uv1_hdr->int_both = 1;
+ UV_PNODE_TO_NASID(base_pnode);
+ uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID;
+ uv1_hdr->command = UV_NET_ENDPOINT_INTD;
+ uv1_hdr->int_both = 1;
/*
* all others need to be set to zero:
* fairness chaining multilevel count replied_to
@@ -1763,11 +1782,11 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
* uses native mode for selective broadcasts.
*/
uv2_3_hdr = &bd2->header.uv2_3_hdr;
- uv2_3_hdr->swack_flag = 1;
+ uv2_3_hdr->swack_flag = 1;
uv2_3_hdr->base_dest_nasid =
- UV_PNODE_TO_NASID(base_pnode);
- uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID;
- uv2_3_hdr->command = UV_NET_ENDPOINT_INTD;
+ UV_PNODE_TO_NASID(base_pnode);
+ uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID;
+ uv2_3_hdr->command = UV_NET_ENDPOINT_INTD;
}
}
for_each_present_cpu(cpu) {
@@ -1790,10 +1809,7 @@ static void pq_init(int node, int pnode)
size_t plsize;
char *cp;
void *vp;
- unsigned long pn;
- unsigned long first;
- unsigned long pn_first;
- unsigned long last;
+ unsigned long gnode, first, last, tail;
struct bau_pq_entry *pqp;
struct bau_control *bcp;
@@ -1814,17 +1830,25 @@ static void pq_init(int node, int pnode)
bcp->bau_msg_head = pqp;
bcp->queue_last = pqp + (DEST_Q_SIZE - 1);
}
+
+ first = ops.bau_gpa_to_offset(uv_gpa(pqp));
+ last = ops.bau_gpa_to_offset(uv_gpa(pqp + (DEST_Q_SIZE - 1)));
+
/*
- * need the gnode of where the memory was really allocated
+ * Pre UV4, the gnode is required to locate the payload queue
+ * and the payload queue tail must be maintained by the kernel.
*/
- pn = uv_gpa_to_gnode(uv_gpa(pqp));
- first = uv_physnodeaddr(pqp);
- pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first;
- last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1));
- write_mmr_payload_first(pnode, pn_first);
- write_mmr_payload_tail(pnode, first);
- write_mmr_payload_last(pnode, last);
- write_gmmr_sw_ack(pnode, 0xffffUL);
+ bcp = &per_cpu(bau_control, smp_processor_id());
+ if (bcp->uvhub_version <= 3) {
+ tail = first;
+ gnode = uv_gpa_to_gnode(uv_gpa(pqp));
+ first = (gnode << UV_PAYLOADQ_GNODE_SHIFT) | tail;
+ write_mmr_payload_tail(pnode, tail);
+ }
+
+ ops.write_payload_first(pnode, first);
+ ops.write_payload_last(pnode, last);
+ ops.write_g_sw_ack(pnode, 0xffffUL);
/* in effect, all msg_type's are set to MSG_NOOP */
memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
@@ -1914,8 +1938,8 @@ static void __init init_per_cpu_tunables(void)
bcp->complete_threshold = complete_threshold;
bcp->cong_response_us = congested_respns_us;
bcp->cong_reps = congested_reps;
- bcp->disabled_period = sec_2_cycles(disabled_period);
- bcp->giveup_limit = giveup_limit;
+ bcp->disabled_period = sec_2_cycles(disabled_period);
+ bcp->giveup_limit = giveup_limit;
spin_lock_init(&bcp->queue_lock);
spin_lock_init(&bcp->uvhub_lock);
spin_lock_init(&bcp->disable_lock);
@@ -1944,7 +1968,7 @@ static int __init get_cpu_topology(int base_pnode,
pnode = uv_cpu_hub_info(cpu)->pnode;
if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) {
- printk(KERN_EMERG
+ pr_emerg(
"cpu %d pnode %d-%d beyond %d; BAU disabled\n",
cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE);
return 1;
@@ -1969,7 +1993,7 @@ static int __init get_cpu_topology(int base_pnode,
sdp->cpu_number[sdp->num_cpus] = cpu;
sdp->num_cpus++;
if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
- printk(KERN_EMERG "%d cpus per socket invalid\n",
+ pr_emerg("%d cpus per socket invalid\n",
sdp->num_cpus);
return 1;
}
@@ -2035,15 +2059,17 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
bcp->uvhub_version = 2;
else if (is_uv3_hub())
bcp->uvhub_version = 3;
+ else if (is_uv4_hub())
+ bcp->uvhub_version = 4;
else {
- printk(KERN_EMERG "uvhub version not 1, 2 or 3\n");
+ pr_emerg("uvhub version not 1, 2, 3, or 4\n");
return 1;
}
bcp->uvhub_master = *hmasterp;
bcp->uvhub_cpu = uv_cpu_blade_processor_id(cpu);
if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
- printk(KERN_EMERG "%d cpus per uvhub invalid\n",
+ pr_emerg("%d cpus per uvhub invalid\n",
bcp->uvhub_cpu);
return 1;
}
@@ -2098,7 +2124,8 @@ static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
void *vp;
struct uvhub_desc *uvhub_descs;
- timeout_us = calculate_destination_timeout();
+ if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub())
+ timeout_us = calculate_destination_timeout();
vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
uvhub_descs = (struct uvhub_desc *)vp;
@@ -2138,6 +2165,15 @@ static int __init uv_bau_init(void)
if (!is_uv_system())
return 0;
+ if (is_uv4_hub())
+ ops = uv4_bau_ops;
+ else if (is_uv3_hub())
+ ops = uv123_bau_ops;
+ else if (is_uv2_hub())
+ ops = uv123_bau_ops;
+ else if (is_uv1_hub())
+ ops = uv123_bau_ops;
+
for_each_possible_cpu(cur_cpu) {
mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
@@ -2153,7 +2189,9 @@ static int __init uv_bau_init(void)
uv_base_pnode = uv_blade_to_pnode(uvhub);
}
- enable_timeouts();
+ /* software timeouts are not supported on UV4 */
+ if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub())
+ enable_timeouts();
if (init_per_cpu(nuvhubs, uv_base_pnode)) {
set_bau_off();
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index b12c26e2e309..53cace2ec0e2 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -130,7 +130,7 @@ static void __save_processor_state(struct saved_context *ctxt)
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
ctxt->cr3 = read_cr3();
- ctxt->cr4 = __read_cr4_safe();
+ ctxt->cr4 = __read_cr4();
#ifdef CONFIG_X86_64
ctxt->cr8 = read_cr8();
#endif
diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
index 1104515d5ad2..1ac76479c266 100644
--- a/arch/x86/ras/mce_amd_inj.c
+++ b/arch/x86/ras/mce_amd_inj.c
@@ -68,6 +68,7 @@ static int inj_##reg##_set(void *data, u64 val) \
MCE_INJECT_SET(status);
MCE_INJECT_SET(misc);
MCE_INJECT_SET(addr);
+MCE_INJECT_SET(synd);
#define MCE_INJECT_GET(reg) \
static int inj_##reg##_get(void *data, u64 *val) \
@@ -81,10 +82,12 @@ static int inj_##reg##_get(void *data, u64 *val) \
MCE_INJECT_GET(status);
MCE_INJECT_GET(misc);
MCE_INJECT_GET(addr);
+MCE_INJECT_GET(synd);
DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
/*
* Caller needs to be make sure this cpu doesn't disappear
@@ -243,27 +246,27 @@ static void toggle_nb_mca_mst_cpu(u16 nid)
static void prepare_msrs(void *info)
{
- struct mce i_mce = *(struct mce *)info;
- u8 b = i_mce.bank;
+ struct mce m = *(struct mce *)info;
+ u8 b = m.bank;
- wrmsrl(MSR_IA32_MCG_STATUS, i_mce.mcgstatus);
+ wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
if (boot_cpu_has(X86_FEATURE_SMCA)) {
- if (i_mce.inject_flags == DFR_INT_INJ) {
- wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), i_mce.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), i_mce.addr);
+ if (m.inject_flags == DFR_INT_INJ) {
+ wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr);
} else {
- wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), i_mce.status);
- wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), i_mce.addr);
+ wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr);
}
- wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc);
+ wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc);
+ wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd);
} else {
- wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status);
- wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr);
- wrmsrl(MSR_IA32_MCx_MISC(b), i_mce.misc);
+ wrmsrl(MSR_IA32_MCx_STATUS(b), m.status);
+ wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr);
+ wrmsrl(MSR_IA32_MCx_MISC(b), m.misc);
}
-
}
static void do_inject(void)
@@ -275,6 +278,9 @@ static void do_inject(void)
if (i_mce.misc)
i_mce.status |= MCI_STATUS_MISCV;
+ if (i_mce.synd)
+ i_mce.status |= MCI_STATUS_SYNDV;
+
if (inj_type == SW_INJ) {
mce_inject_log(&i_mce);
return;
@@ -301,7 +307,9 @@ static void do_inject(void)
* only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for
* Fam10h and later BKDGs.
*/
- if (static_cpu_has(X86_FEATURE_AMD_DCM) && b == 4) {
+ if (static_cpu_has(X86_FEATURE_AMD_DCM) &&
+ b == 4 &&
+ boot_cpu_data.x86 < 0x17) {
toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
cpu = get_nbc_for_node(amd_get_nb_id(cpu));
}
@@ -371,6 +379,9 @@ static const char readme_msg[] =
"\t used for error thresholding purposes and its validity is indicated by\n"
"\t MCi_STATUS[MiscV].\n"
"\n"
+"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n"
+"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n"
+"\n"
"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
"\t associated with the error.\n"
"\n"
@@ -420,6 +431,7 @@ static struct dfs_node {
{ .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
@@ -428,7 +440,7 @@ static struct dfs_node {
static int __init init_mce_inject(void)
{
- int i;
+ unsigned int i;
u64 cap;
rdmsrl(MSR_IA32_MCG_CAP, cap);
@@ -452,26 +464,22 @@ static int __init init_mce_inject(void)
return 0;
err_dfs_add:
- while (--i >= 0)
+ while (i-- > 0)
debugfs_remove(dfs_fls[i].d);
debugfs_remove(dfs_inj);
dfs_inj = NULL;
- return -ENOMEM;
+ return -ENODEV;
}
static void __exit exit_mce_inject(void)
{
- int i;
- for (i = 0; i < ARRAY_SIZE(dfs_fls); i++)
- debugfs_remove(dfs_fls[i].d);
+ debugfs_remove_recursive(dfs_inj);
+ dfs_inj = NULL;
memset(&dfs_fls, 0, sizeof(dfs_fls));
-
- debugfs_remove(dfs_inj);
- dfs_inj = NULL;
}
module_init(init_mce_inject);
module_exit(exit_mce_inject);
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 3ee2bb6b440b..e7e7055a8658 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -8,7 +8,7 @@ else
BITS := 64
endif
-obj-y = bug.o bugs_$(BITS).o delay.o fault.o ksyms.o ldt.o \
+obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \
ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \
stub_$(BITS).o stub_segv.o \
sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \
diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S
index fa4b8b9841ff..b9933eb9274a 100644
--- a/arch/x86/um/checksum_32.S
+++ b/arch/x86/um/checksum_32.S
@@ -27,6 +27,7 @@
#include <asm/errno.h>
#include <asm/asm.h>
+#include <asm/export.h>
/*
* computes a partial checksum, e.g. for TCP/UDP fragments
@@ -214,3 +215,4 @@ csum_partial:
ret
#endif
+ EXPORT_SYMBOL(csum_partial)
diff --git a/arch/x86/um/ksyms.c b/arch/x86/um/ksyms.c
deleted file mode 100644
index 2e8f43ec6214..000000000000
--- a/arch/x86/um/ksyms.c
+++ /dev/null
@@ -1,13 +0,0 @@
-#include <linux/module.h>
-#include <asm/string.h>
-#include <asm/checksum.h>
-
-#ifndef CONFIG_X86_32
-/*XXX: we need them because they would be exported by x86_64 */
-#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
-EXPORT_SYMBOL(memcpy);
-#else
-EXPORT_SYMBOL(__memcpy);
-#endif
-#endif
-EXPORT_SYMBOL(csum_partial);
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index a7ef7b131e25..60a5a5a85505 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -36,7 +36,8 @@ int is_syscall(unsigned long addr)
* slow, but that doesn't matter, since it will be called only
* in case of singlestepping, if copy_from_user failed.
*/
- n = access_process_vm(current, addr, &instr, sizeof(instr), 0);
+ n = access_process_vm(current, addr, &instr, sizeof(instr),
+ FOLL_FORCE);
if (n != sizeof(instr)) {
printk(KERN_ERR "is_syscall : failed to read "
"instruction from 0x%lx\n", addr);
@@ -194,7 +195,7 @@ int peek_user(struct task_struct *child, long addr, long data)
static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
{
- int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
+ int err, n, cpu = task_cpu(child);
struct user_i387_struct fpregs;
err = save_i387_registers(userspace_pid[cpu],
@@ -211,7 +212,7 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c
static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
{
- int n, cpu = ((struct thread_info *) child->stack)->cpu;
+ int n, cpu = task_cpu(child);
struct user_i387_struct fpregs;
n = copy_from_user(&fpregs, buf, sizeof(fpregs));
@@ -224,7 +225,7 @@ static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *c
static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child)
{
- int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
+ int err, n, cpu = task_cpu(child);
struct user_fxsr_struct fpregs;
err = save_fpx_registers(userspace_pid[cpu], (unsigned long *) &fpregs);
@@ -240,7 +241,7 @@ static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *
static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child)
{
- int n, cpu = ((struct thread_info *) child->stack)->cpu;
+ int n, cpu = task_cpu(child);
struct user_fxsr_struct fpregs;
n = copy_from_user(&fpregs, buf, sizeof(fpregs));
diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c
index 0b5c184dd5b3..e30202b1716e 100644
--- a/arch/x86/um/ptrace_64.c
+++ b/arch/x86/um/ptrace_64.c
@@ -212,7 +212,8 @@ int is_syscall(unsigned long addr)
* slow, but that doesn't matter, since it will be called only
* in case of singlestepping, if copy_from_user failed.
*/
- n = access_process_vm(current, addr, &instr, sizeof(instr), 0);
+ n = access_process_vm(current, addr, &instr, sizeof(instr),
+ FOLL_FORCE);
if (n != sizeof(instr)) {
printk("is_syscall : failed to read instruction from "
"0x%lx\n", addr);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index b86ebb1a9a7f..c0fdd57da7aa 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -137,8 +137,10 @@ struct shared_info xen_dummy_shared_info;
void *xen_initial_gdt;
RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
-__read_mostly int xen_have_vector_callback;
-EXPORT_SYMBOL_GPL(xen_have_vector_callback);
+
+static int xen_cpu_up_prepare(unsigned int cpu);
+static int xen_cpu_up_online(unsigned int cpu);
+static int xen_cpu_dead(unsigned int cpu);
/*
* Point at some empty memory to start with. We map the real shared_info
@@ -1237,7 +1239,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.write_cr0 = xen_write_cr0,
.read_cr4 = native_read_cr4,
- .read_cr4_safe = native_read_cr4_safe,
.write_cr4 = xen_write_cr4,
#ifdef CONFIG_X86_64
@@ -1520,10 +1521,7 @@ static void __init xen_pvh_early_guest_init(void)
if (!xen_feature(XENFEAT_auto_translated_physmap))
return;
- if (!xen_feature(XENFEAT_hvm_callback_vector))
- return;
-
- xen_have_vector_callback = 1;
+ BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector));
xen_pvh_early_cpu_init(0, false);
xen_pvh_set_cr_flags(0);
@@ -1539,6 +1537,24 @@ static void __init xen_dom0_set_legacy_features(void)
x86_platform.legacy.rtc = 1;
}
+static int xen_cpuhp_setup(void)
+{
+ int rc;
+
+ rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE,
+ "XEN_HVM_GUEST_PREPARE",
+ xen_cpu_up_prepare, xen_cpu_dead);
+ if (rc >= 0) {
+ rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "XEN_HVM_GUEST_ONLINE",
+ xen_cpu_up_online, NULL);
+ if (rc < 0)
+ cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE);
+ }
+
+ return rc >= 0 ? 0 : rc;
+}
+
/* First C function to be called on Xen boot */
asmlinkage __visible void __init xen_start_kernel(void)
{
@@ -1640,6 +1656,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+ WARN_ON(xen_cpuhp_setup());
+
local_irq_disable();
early_boot_irqs_disabled = true;
@@ -1820,31 +1838,54 @@ static void __init init_hvm_pv_info(void)
xen_domain_type = XEN_HVM_DOMAIN;
}
-static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action,
- void *hcpu)
+static int xen_cpu_up_prepare(unsigned int cpu)
{
- int cpu = (long)hcpu;
- switch (action) {
- case CPU_UP_PREPARE:
+ int rc;
+
+ if (xen_hvm_domain()) {
+ /*
+ * This can happen if CPU was offlined earlier and
+ * offlining timed out in common_cpu_die().
+ */
+ if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ }
+
if (cpu_acpi_id(cpu) != U32_MAX)
per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
else
per_cpu(xen_vcpu_id, cpu) = cpu;
xen_vcpu_setup(cpu);
- if (xen_have_vector_callback) {
- if (xen_feature(XENFEAT_hvm_safe_pvclock))
- xen_setup_timer(cpu);
- }
- break;
- default:
- break;
}
- return NOTIFY_OK;
+
+ if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
+ xen_setup_timer(cpu);
+
+ rc = xen_smp_intr_init(cpu);
+ if (rc) {
+ WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
+ cpu, rc);
+ return rc;
+ }
+ return 0;
}
-static struct notifier_block xen_hvm_cpu_notifier = {
- .notifier_call = xen_hvm_cpu_notify,
-};
+static int xen_cpu_dead(unsigned int cpu)
+{
+ xen_smp_intr_free(cpu);
+
+ if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock))
+ xen_teardown_timer(cpu);
+
+ return 0;
+}
+
+static int xen_cpu_up_online(unsigned int cpu)
+{
+ xen_init_lock_cpu(cpu);
+ return 0;
+}
#ifdef CONFIG_KEXEC_CORE
static void xen_hvm_shutdown(void)
@@ -1872,10 +1913,10 @@ static void __init xen_hvm_guest_init(void)
xen_panic_handler_init();
- if (xen_feature(XENFEAT_hvm_callback_vector))
- xen_have_vector_callback = 1;
+ BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector));
+
xen_hvm_smp_init();
- register_cpu_notifier(&xen_hvm_cpu_notifier);
+ WARN_ON(xen_cpuhp_setup());
xen_unplug_emulated_devices();
x86_init.irqs.intr_init = xen_init_IRQ;
xen_hvm_init_time_ops();
@@ -1911,7 +1952,7 @@ bool xen_hvm_need_lapic(void)
return false;
if (!xen_hvm_domain())
return false;
- if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
+ if (xen_feature(XENFEAT_hvm_pirqs))
return false;
return true;
}
@@ -1925,6 +1966,45 @@ static void xen_set_cpu_features(struct cpuinfo_x86 *c)
}
}
+static void xen_pin_vcpu(int cpu)
+{
+ static bool disable_pinning;
+ struct sched_pin_override pin_override;
+ int ret;
+
+ if (disable_pinning)
+ return;
+
+ pin_override.pcpu = cpu;
+ ret = HYPERVISOR_sched_op(SCHEDOP_pin_override, &pin_override);
+
+ /* Ignore errors when removing override. */
+ if (cpu < 0)
+ return;
+
+ switch (ret) {
+ case -ENOSYS:
+ pr_warn("Unable to pin on physical cpu %d. In case of problems consider vcpu pinning.\n",
+ cpu);
+ disable_pinning = true;
+ break;
+ case -EPERM:
+ WARN(1, "Trying to pin vcpu without having privilege to do so\n");
+ disable_pinning = true;
+ break;
+ case -EINVAL:
+ case -EBUSY:
+ pr_warn("Physical cpu %d not available for pinning. Check Xen cpu configuration.\n",
+ cpu);
+ break;
+ case 0:
+ break;
+ default:
+ WARN(1, "rc %d while trying to pin vcpu\n", ret);
+ disable_pinning = true;
+ }
+}
+
const struct hypervisor_x86 x86_hyper_xen = {
.name = "Xen",
.detect = xen_platform,
@@ -1933,6 +2013,7 @@ const struct hypervisor_x86 x86_hyper_xen = {
#endif
.x2apic_available = xen_x2apic_para_available,
.set_cpu_features = xen_set_cpu_features,
+ .pin_vcpu = xen_pin_vcpu,
};
EXPORT_SYMBOL(x86_hyper_xen);
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index de4144c24f1c..809b6c812654 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -89,7 +89,7 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
{
- area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL);
+ area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL);
if (area->ptes == NULL)
return -ENOMEM;
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index d37a0c7f82cb..90d1b83cf35f 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -61,7 +61,7 @@ static int check_platform_magic(void)
}
break;
default:
- printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
+ printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version\n");
return XEN_PLATFORM_ERR_PROTOCOL;
}
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index 32bdc2c90297..b9fc52556bcc 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -547,8 +547,11 @@ void xen_pmu_init(int cpu)
return;
fail:
- pr_info_once("Could not initialize VPMU for cpu %d, error %d\n",
- cpu, err);
+ if (err == -EOPNOTSUPP || err == -ENOSYS)
+ pr_info_once("VPMU disabled by hypervisor.\n");
+ else
+ pr_info_once("Could not initialize VPMU for cpu %d, error %d\n",
+ cpu, err);
free_pages((unsigned long)xenpmu_data, 0);
}
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 176425233e4d..f8960fca0827 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -861,7 +861,7 @@ char * __init xen_memory_setup(void)
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
E820_RESERVED);
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
/*
* Check whether the kernel itself conflicts with the target E820 map.
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 0b4d04c8ab4d..9fa27ceeecfd 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -87,6 +87,12 @@ static void cpu_bringup(void)
cpu_data(cpu).x86_max_cores = 1;
set_cpu_sibling_map(cpu);
+ /*
+ * identify_cpu() may have set logical_pkg_id to -1 due
+ * to incorrect phys_proc_id. Let's re-comupte it.
+ */
+ topology_update_package_map(apic->cpu_present_to_apicid(cpu), cpu);
+
xen_setup_cpu_clockevents();
notify_cpu_starting(cpu);
@@ -115,7 +121,7 @@ asmlinkage __visible void cpu_bringup_and_idle(int cpu)
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
-static void xen_smp_intr_free(unsigned int cpu)
+void xen_smp_intr_free(unsigned int cpu)
{
if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL);
@@ -159,7 +165,7 @@ static void xen_smp_intr_free(unsigned int cpu)
per_cpu(xen_pmu_irq, cpu).name = NULL;
}
};
-static int xen_smp_intr_init(unsigned int cpu)
+int xen_smp_intr_init(unsigned int cpu)
{
int rc;
char *resched_name, *callfunc_name, *debug_name, *pmu_name;
@@ -475,8 +481,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
common_cpu_up(cpu, idle);
xen_setup_runstate_info(cpu);
- xen_setup_timer(cpu);
- xen_init_lock_cpu(cpu);
/*
* PV VCPUs are always successfully taken down (see 'while' loop
@@ -495,10 +499,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
xen_pmu_init(cpu);
- rc = xen_smp_intr_init(cpu);
- if (rc)
- return rc;
-
rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL);
BUG_ON(rc);
@@ -769,47 +769,10 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
xen_init_lock_cpu(0);
}
-static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
-{
- int rc;
-
- /*
- * This can happen if CPU was offlined earlier and
- * offlining timed out in common_cpu_die().
- */
- if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
- xen_smp_intr_free(cpu);
- xen_uninit_lock_cpu(cpu);
- }
-
- /*
- * xen_smp_intr_init() needs to run before native_cpu_up()
- * so that IPI vectors are set up on the booting CPU before
- * it is marked online in native_cpu_up().
- */
- rc = xen_smp_intr_init(cpu);
- WARN_ON(rc);
- if (!rc)
- rc = native_cpu_up(cpu, tidle);
-
- /*
- * We must initialize the slowpath CPU kicker _after_ the native
- * path has executed. If we initialized it before none of the
- * unlocker IPI kicks would reach the booting CPU as the booting
- * CPU had not set itself 'online' in cpu_online_mask. That mask
- * is checked when IPIs are sent (on HVM at least).
- */
- xen_init_lock_cpu(cpu);
- return rc;
-}
-
void __init xen_hvm_smp_init(void)
{
- if (!xen_have_vector_callback)
- return;
smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
- smp_ops.cpu_up = xen_hvm_cpu_up;
smp_ops.cpu_die = xen_cpu_die;
smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index 963d62a35c82..c5c16dc4f694 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -1,5 +1,6 @@
#ifndef _XEN_SMP_H
+#ifdef CONFIG_SMP
extern void xen_send_IPI_mask(const struct cpumask *mask,
int vector);
extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
@@ -8,6 +9,18 @@ extern void xen_send_IPI_allbutself(int vector);
extern void xen_send_IPI_all(int vector);
extern void xen_send_IPI_self(int vector);
+extern int xen_smp_intr_init(unsigned int cpu);
+extern void xen_smp_intr_free(unsigned int cpu);
+
+#else /* CONFIG_SMP */
+
+static inline int xen_smp_intr_init(unsigned int cpu)
+{
+ return 0;
+}
+static inline void xen_smp_intr_free(unsigned int cpu) {}
+#endif /* CONFIG_SMP */
+
#ifdef CONFIG_XEN_PVH
extern void xen_pvh_early_cpu_init(int cpu, bool entry);
#else
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index f42e78de1e10..3d6e0064cbfc 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -21,8 +21,6 @@ static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
static DEFINE_PER_CPU(char *, irq_name);
static bool xen_pvspin = true;
-#ifdef CONFIG_QUEUED_SPINLOCKS
-
#include <asm/qspinlock.h>
static void xen_qlock_kick(int cpu)
@@ -71,207 +69,6 @@ static void xen_qlock_wait(u8 *byte, u8 val)
xen_poll_irq(irq);
}
-#else /* CONFIG_QUEUED_SPINLOCKS */
-
-enum xen_contention_stat {
- TAKEN_SLOW,
- TAKEN_SLOW_PICKUP,
- TAKEN_SLOW_SPURIOUS,
- RELEASED_SLOW,
- RELEASED_SLOW_KICKED,
- NR_CONTENTION_STATS
-};
-
-
-#ifdef CONFIG_XEN_DEBUG_FS
-#define HISTO_BUCKETS 30
-static struct xen_spinlock_stats
-{
- u32 contention_stats[NR_CONTENTION_STATS];
- u32 histo_spin_blocked[HISTO_BUCKETS+1];
- u64 time_blocked;
-} spinlock_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
- u8 ret;
- u8 old = READ_ONCE(zero_stats);
- if (unlikely(old)) {
- ret = cmpxchg(&zero_stats, old, 0);
- /* This ensures only one fellow resets the stat */
- if (ret == old)
- memset(&spinlock_stats, 0, sizeof(spinlock_stats));
- }
-}
-
-static inline void add_stats(enum xen_contention_stat var, u32 val)
-{
- check_zero();
- spinlock_stats.contention_stats[var] += val;
-}
-
-static inline u64 spin_time_start(void)
-{
- return xen_clocksource_read();
-}
-
-static void __spin_time_accum(u64 delta, u32 *array)
-{
- unsigned index = ilog2(delta);
-
- check_zero();
-
- if (index < HISTO_BUCKETS)
- array[index]++;
- else
- array[HISTO_BUCKETS]++;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
- u32 delta = xen_clocksource_read() - start;
-
- __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
- spinlock_stats.time_blocked += delta;
-}
-#else /* !CONFIG_XEN_DEBUG_FS */
-static inline void add_stats(enum xen_contention_stat var, u32 val)
-{
-}
-
-static inline u64 spin_time_start(void)
-{
- return 0;
-}
-
-static inline void spin_time_accum_blocked(u64 start)
-{
-}
-#endif /* CONFIG_XEN_DEBUG_FS */
-
-struct xen_lock_waiting {
- struct arch_spinlock *lock;
- __ticket_t want;
-};
-
-static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
-static cpumask_t waiting_cpus;
-
-__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
-{
- int irq = __this_cpu_read(lock_kicker_irq);
- struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting);
- int cpu = smp_processor_id();
- u64 start;
- __ticket_t head;
- unsigned long flags;
-
- /* If kicker interrupts not initialized yet, just spin */
- if (irq == -1)
- return;
-
- start = spin_time_start();
-
- /*
- * Make sure an interrupt handler can't upset things in a
- * partially setup state.
- */
- local_irq_save(flags);
- /*
- * We don't really care if we're overwriting some other
- * (lock,want) pair, as that would mean that we're currently
- * in an interrupt context, and the outer context had
- * interrupts enabled. That has already kicked the VCPU out
- * of xen_poll_irq(), so it will just return spuriously and
- * retry with newly setup (lock,want).
- *
- * The ordering protocol on this is that the "lock" pointer
- * may only be set non-NULL if the "want" ticket is correct.
- * If we're updating "want", we must first clear "lock".
- */
- w->lock = NULL;
- smp_wmb();
- w->want = want;
- smp_wmb();
- w->lock = lock;
-
- /* This uses set_bit, which atomic and therefore a barrier */
- cpumask_set_cpu(cpu, &waiting_cpus);
- add_stats(TAKEN_SLOW, 1);
-
- /* clear pending */
- xen_clear_irq_pending(irq);
-
- /* Only check lock once pending cleared */
- barrier();
-
- /*
- * Mark entry to slowpath before doing the pickup test to make
- * sure we don't deadlock with an unlocker.
- */
- __ticket_enter_slowpath(lock);
-
- /* make sure enter_slowpath, which is atomic does not cross the read */
- smp_mb__after_atomic();
-
- /*
- * check again make sure it didn't become free while
- * we weren't looking
- */
- head = READ_ONCE(lock->tickets.head);
- if (__tickets_equal(head, want)) {
- add_stats(TAKEN_SLOW_PICKUP, 1);
- goto out;
- }
-
- /* Allow interrupts while blocked */
- local_irq_restore(flags);
-
- /*
- * If an interrupt happens here, it will leave the wakeup irq
- * pending, which will cause xen_poll_irq() to return
- * immediately.
- */
-
- /* Block until irq becomes pending (or perhaps a spurious wakeup) */
- xen_poll_irq(irq);
- add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq));
-
- local_irq_save(flags);
-
- kstat_incr_irq_this_cpu(irq);
-out:
- cpumask_clear_cpu(cpu, &waiting_cpus);
- w->lock = NULL;
-
- local_irq_restore(flags);
-
- spin_time_accum_blocked(start);
-}
-PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning);
-
-static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
-{
- int cpu;
-
- add_stats(RELEASED_SLOW, 1);
-
- for_each_cpu(cpu, &waiting_cpus) {
- const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
-
- /* Make sure we read lock before want */
- if (READ_ONCE(w->lock) == lock &&
- READ_ONCE(w->want) == next) {
- add_stats(RELEASED_SLOW_KICKED, 1);
- xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
- break;
- }
- }
-}
-#endif /* CONFIG_QUEUED_SPINLOCKS */
-
static irqreturn_t dummy_handler(int irq, void *dev_id)
{
BUG();
@@ -334,16 +131,12 @@ void __init xen_init_spinlocks(void)
return;
}
printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
-#ifdef CONFIG_QUEUED_SPINLOCKS
+
__pv_init_lock_hash();
pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = xen_qlock_wait;
pv_lock_ops.kick = xen_qlock_kick;
-#else
- pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
- pv_lock_ops.unlock_kick = xen_unlock_kick;
-#endif
}
/*
@@ -372,44 +165,3 @@ static __init int xen_parse_nopvspin(char *arg)
}
early_param("xen_nopvspin", xen_parse_nopvspin);
-#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS)
-
-static struct dentry *d_spin_debug;
-
-static int __init xen_spinlock_debugfs(void)
-{
- struct dentry *d_xen = xen_init_debugfs();
-
- if (d_xen == NULL)
- return -ENOMEM;
-
- if (!xen_pvspin)
- return 0;
-
- d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
-
- debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
-
- debugfs_create_u32("taken_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW]);
- debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]);
- debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]);
-
- debugfs_create_u32("released_slow", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW]);
- debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
- &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]);
-
- debugfs_create_u64("time_blocked", 0444, d_spin_debug,
- &spinlock_stats.time_blocked);
-
- debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
- spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
-
- return 0;
-}
-fs_initcall(xen_spinlock_debugfs);
-
-#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 67356d29d74d..33d8f6a7829d 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -432,11 +432,6 @@ static void xen_hvm_setup_cpu_clockevents(void)
void __init xen_hvm_init_time_ops(void)
{
- /* vector callback is needed otherwise we cannot receive interrupts
- * on cpu > 0 and at this point we don't know how many cpus are
- * available */
- if (!xen_have_vector_callback)
- return;
if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
"disable pv timer\n");