diff options
Diffstat (limited to 'arch/um/kernel')
-rw-r--r-- | arch/um/kernel/Makefile | 5 | ||||
-rw-r--r-- | arch/um/kernel/dtb.c | 3 | ||||
-rw-r--r-- | arch/um/kernel/dyn.lds.S | 5 | ||||
-rw-r--r-- | arch/um/kernel/initrd.c | 1 | ||||
-rw-r--r-- | arch/um/kernel/ioport.c | 13 | ||||
-rw-r--r-- | arch/um/kernel/irq.c | 194 | ||||
-rw-r--r-- | arch/um/kernel/load_file.c | 4 | ||||
-rw-r--r-- | arch/um/kernel/maccess.c | 19 | ||||
-rw-r--r-- | arch/um/kernel/mem.c | 62 | ||||
-rw-r--r-- | arch/um/kernel/physmem.c | 41 | ||||
-rw-r--r-- | arch/um/kernel/process.c | 40 | ||||
-rw-r--r-- | arch/um/kernel/sigio.c | 26 | ||||
-rw-r--r-- | arch/um/kernel/skas/.gitignore | 2 | ||||
-rw-r--r-- | arch/um/kernel/skas/Makefile | 38 | ||||
-rw-r--r-- | arch/um/kernel/skas/mmu.c | 117 | ||||
-rw-r--r-- | arch/um/kernel/skas/process.c | 4 | ||||
-rw-r--r-- | arch/um/kernel/skas/stub.c | 140 | ||||
-rw-r--r-- | arch/um/kernel/skas/stub_exe.c | 230 | ||||
-rw-r--r-- | arch/um/kernel/skas/stub_exe_embed.S | 11 | ||||
-rw-r--r-- | arch/um/kernel/skas/syscall.c | 11 | ||||
-rw-r--r-- | arch/um/kernel/sysrq.c | 8 | ||||
-rw-r--r-- | arch/um/kernel/time.c | 33 | ||||
-rw-r--r-- | arch/um/kernel/tlb.c | 74 | ||||
-rw-r--r-- | arch/um/kernel/trap.c | 172 | ||||
-rw-r--r-- | arch/um/kernel/um_arch.c | 93 | ||||
-rw-r--r-- | arch/um/kernel/uml.lds.S | 2 |
26 files changed, 883 insertions, 465 deletions
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index f8567b933ffa..b8f4e9281599 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -12,12 +12,12 @@ CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START) \ -DELF_ARCH=$(LDS_ELF_ARCH) \ -DELF_FORMAT=$(LDS_ELF_FORMAT) \ $(LDS_EXTRA) -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ physmem.o process.o ptrace.o reboot.o sigio.o \ signal.o sysrq.o time.o tlb.o trap.o \ - um_arch.o umid.o maccess.o kmsg_dump.o capflags.o skas/ + um_arch.o umid.o kmsg_dump.o capflags.o skas/ obj-y += load_file.o obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o @@ -25,7 +25,6 @@ obj-$(CONFIG_GPROF) += gprof_syms.o obj-$(CONFIG_OF) += dtb.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_STACKTRACE) += stacktrace.o -obj-$(CONFIG_GENERIC_PCI_IOMAP) += ioport.o USER_OBJS := config.o diff --git a/arch/um/kernel/dtb.c b/arch/um/kernel/dtb.c index 4954188a6a09..15c342426489 100644 --- a/arch/um/kernel/dtb.c +++ b/arch/um/kernel/dtb.c @@ -17,7 +17,7 @@ void uml_dtb_init(void) area = uml_load_file(dtb, &size); if (area) { - if (!early_init_dt_scan(area)) { + if (!early_init_dt_scan(area, __pa(area))) { pr_err("invalid DTB %s\n", dtb); memblock_free(area, size); return; @@ -31,6 +31,7 @@ void uml_dtb_init(void) static int __init uml_dtb_setup(char *line, int *add) { + *add = 0; dtb = line; return 0; } diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 3385d653ebd0..a36b7918a011 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -116,8 +116,6 @@ SECTIONS .fini_array : { *(.fini_array) } .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) - . = ALIGN(KERNEL_STACK_SIZE); - *(.data..init_irqstack) DATA_DATA *(.data.* .gnu.linkonce.d.*) SORT(CONSTRUCTORS) @@ -178,3 +176,6 @@ SECTIONS DISCARDS } + +ASSERT(__syscall_stub_end - __syscall_stub_start <= PAGE_SIZE, + "STUB code must not be larger than one page"); diff --git a/arch/um/kernel/initrd.c b/arch/um/kernel/initrd.c index 47b8cb1a1156..99dba827461c 100644 --- a/arch/um/kernel/initrd.c +++ b/arch/um/kernel/initrd.c @@ -34,6 +34,7 @@ int __init read_initrd(void) static int __init uml_initrd_setup(char *line, int *add) { + *add = 0; initrd = line; return 0; } diff --git a/arch/um/kernel/ioport.c b/arch/um/kernel/ioport.c deleted file mode 100644 index 7220615b3beb..000000000000 --- a/arch/um/kernel/ioport.c +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2021 Intel Corporation - * Author: Johannes Berg <johannes@sipsolutions.net> - */ -#include <asm/iomap.h> -#include <asm-generic/pci_iomap.h> - -void __iomem *__pci_ioport_map(struct pci_dev *dev, unsigned long port, - unsigned int nr) -{ - return NULL; -} diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 534e91797f89..0dfaf96bb7da 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -52,7 +52,7 @@ struct irq_entry { bool sigio_workaround; }; -static DEFINE_SPINLOCK(irq_lock); +static DEFINE_RAW_SPINLOCK(irq_lock); static LIST_HEAD(active_fds); static DECLARE_BITMAP(irqs_allocated, UM_LAST_SIGNAL_IRQ); static bool irqs_suspended; @@ -236,7 +236,8 @@ static void _sigio_handler(struct uml_pt_regs *regs, free_irqs(); } -void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { preempt_disable(); _sigio_handler(regs, irqs_suspended); @@ -257,7 +258,7 @@ static struct irq_entry *get_irq_entry_by_fd(int fd) return NULL; } -static void free_irq_entry(struct irq_entry *to_free, bool remove) +static void remove_irq_entry(struct irq_entry *to_free, bool remove) { if (!to_free) return; @@ -265,7 +266,6 @@ static void free_irq_entry(struct irq_entry *to_free, bool remove) if (remove) os_del_epoll_fd(to_free->fd); list_del(&to_free->list); - kfree(to_free); } static bool update_irq_entry(struct irq_entry *entry) @@ -286,17 +286,19 @@ static bool update_irq_entry(struct irq_entry *entry) return false; } -static void update_or_free_irq_entry(struct irq_entry *entry) +static struct irq_entry *update_or_remove_irq_entry(struct irq_entry *entry) { - if (!update_irq_entry(entry)) - free_irq_entry(entry, false); + if (update_irq_entry(entry)) + return NULL; + remove_irq_entry(entry, false); + return entry; } static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, void (*timetravel_handler)(int, int, void *, struct time_travel_event *)) { - struct irq_entry *irq_entry; + struct irq_entry *irq_entry, *to_free = NULL; int err, events = os_event_mask(type); unsigned long flags; @@ -304,9 +306,10 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, if (err < 0) goto out; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); irq_entry = get_irq_entry_by_fd(fd); if (irq_entry) { +already: /* cannot register the same FD twice with the same type */ if (WARN_ON(irq_entry->reg[type].events)) { err = -EALREADY; @@ -316,11 +319,22 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, /* temporarily disable to avoid IRQ-side locking */ os_del_epoll_fd(fd); } else { - irq_entry = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); - if (!irq_entry) { - err = -ENOMEM; - goto out_unlock; + struct irq_entry *new; + + /* don't restore interrupts */ + raw_spin_unlock(&irq_lock); + new = kzalloc(sizeof(*irq_entry), GFP_ATOMIC); + if (!new) { + local_irq_restore(flags); + return -ENOMEM; + } + raw_spin_lock(&irq_lock); + irq_entry = get_irq_entry_by_fd(fd); + if (irq_entry) { + to_free = new; + goto already; } + irq_entry = new; irq_entry->fd = fd; list_add_tail(&irq_entry->list, &active_fds); maybe_sigio_broken(fd); @@ -339,12 +353,11 @@ static int activate_fd(int irq, int fd, enum um_irq_type type, void *dev_id, #endif WARN_ON(!update_irq_entry(irq_entry)); - spin_unlock_irqrestore(&irq_lock, flags); - - return 0; + err = 0; out_unlock: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); out: + kfree(to_free); return err; } @@ -358,19 +371,20 @@ void free_irq_by_fd(int fd) struct irq_entry *to_free; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); to_free = get_irq_entry_by_fd(fd); - free_irq_entry(to_free, true); - spin_unlock_irqrestore(&irq_lock, flags); + remove_irq_entry(to_free, true); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } EXPORT_SYMBOL(free_irq_by_fd); static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) { - struct irq_entry *entry; + struct irq_entry *entry, *to_free = NULL; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type i; @@ -386,12 +400,13 @@ static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) os_del_epoll_fd(entry->fd); reg->events = 0; - update_or_free_irq_entry(entry); + to_free = update_or_remove_irq_entry(entry); goto out; } } out: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(to_free); } void deactivate_fd(int fd, int irqnum) @@ -402,7 +417,7 @@ void deactivate_fd(int fd, int irqnum) os_del_epoll_fd(fd); - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); entry = get_irq_entry_by_fd(fd); if (!entry) goto out; @@ -414,9 +429,10 @@ void deactivate_fd(int fd, int irqnum) entry->reg[i].events = 0; } - update_or_free_irq_entry(entry); + entry = update_or_remove_irq_entry(entry); out: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); + kfree(entry); ignore_sigio_fd(fd); } @@ -546,7 +562,7 @@ void um_irqs_suspend(void) irqs_suspended = true; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type t; bool clear = true; @@ -579,7 +595,7 @@ void um_irqs_suspend(void) !__ignore_sigio_fd(entry->fd); } } - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); } void um_irqs_resume(void) @@ -588,7 +604,7 @@ void um_irqs_resume(void) unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { if (entry->suspended) { int err = os_set_fd_async(entry->fd); @@ -602,7 +618,7 @@ void um_irqs_resume(void) } } } - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); irqs_suspended = false; send_sigio_to_self(); @@ -613,7 +629,7 @@ static int normal_irq_set_wake(struct irq_data *d, unsigned int on) struct irq_entry *entry; unsigned long flags; - spin_lock_irqsave(&irq_lock, flags); + raw_spin_lock_irqsave(&irq_lock, flags); list_for_each_entry(entry, &active_fds, list) { enum um_irq_type t; @@ -628,7 +644,7 @@ static int normal_irq_set_wake(struct irq_data *d, unsigned int on) } } unlock: - spin_unlock_irqrestore(&irq_lock, flags); + raw_spin_unlock_irqrestore(&irq_lock, flags); return 0; } #else @@ -675,114 +691,8 @@ void __init init_IRQ(void) os_setup_epoll(); } -/* - * IRQ stack entry and exit: - * - * Unlike i386, UML doesn't receive IRQs on the normal kernel stack - * and switch over to the IRQ stack after some preparation. We use - * sigaltstack to receive signals on a separate stack from the start. - * These two functions make sure the rest of the kernel won't be too - * upset by being on a different stack. The IRQ stack has a - * thread_info structure at the bottom so that current et al continue - * to work. - * - * to_irq_stack copies the current task's thread_info to the IRQ stack - * thread_info and sets the tasks's stack to point to the IRQ stack. - * - * from_irq_stack copies the thread_info struct back (flags may have - * been modified) and resets the task's stack pointer. - * - * Tricky bits - - * - * What happens when two signals race each other? UML doesn't block - * signals with sigprocmask, SA_DEFER, or sa_mask, so a second signal - * could arrive while a previous one is still setting up the - * thread_info. - * - * There are three cases - - * The first interrupt on the stack - sets up the thread_info and - * handles the interrupt - * A nested interrupt interrupting the copying of the thread_info - - * can't handle the interrupt, as the stack is in an unknown state - * A nested interrupt not interrupting the copying of the - * thread_info - doesn't do any setup, just handles the interrupt - * - * The first job is to figure out whether we interrupted stack setup. - * This is done by xchging the signal mask with thread_info->pending. - * If the value that comes back is zero, then there is no setup in - * progress, and the interrupt can be handled. If the value is - * non-zero, then there is stack setup in progress. In order to have - * the interrupt handled, we leave our signal in the mask, and it will - * be handled by the upper handler after it has set up the stack. - * - * Next is to figure out whether we are the outer handler or a nested - * one. As part of setting up the stack, thread_info->real_thread is - * set to non-NULL (and is reset to NULL on exit). This is the - * nesting indicator. If it is non-NULL, then the stack is already - * set up and the handler can run. - */ - -static unsigned long pending_mask; - -unsigned long to_irq_stack(unsigned long *mask_out) +void sigchld_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc) { - struct thread_info *ti; - unsigned long mask, old; - int nested; - - mask = xchg(&pending_mask, *mask_out); - if (mask != 0) { - /* - * If any interrupts come in at this point, we want to - * make sure that their bits aren't lost by our - * putting our bit in. So, this loop accumulates bits - * until xchg returns the same value that we put in. - * When that happens, there were no new interrupts, - * and pending_mask contains a bit for each interrupt - * that came in. - */ - old = *mask_out; - do { - old |= mask; - mask = xchg(&pending_mask, old); - } while (mask != old); - return 1; - } - - ti = current_thread_info(); - nested = (ti->real_thread != NULL); - if (!nested) { - struct task_struct *task; - struct thread_info *tti; - - task = cpu_tasks[ti->cpu].task; - tti = task_thread_info(task); - - *ti = *tti; - ti->real_thread = tti; - task->stack = ti; - } - - mask = xchg(&pending_mask, 0); - *mask_out |= mask | nested; - return 0; + do_IRQ(SIGCHLD_IRQ, regs); } - -unsigned long from_irq_stack(int nested) -{ - struct thread_info *ti, *to; - unsigned long mask; - - ti = current_thread_info(); - - pending_mask = 1; - - to = ti->real_thread; - current->stack = to; - ti->real_thread = NULL; - *to = *ti; - - mask = xchg(&pending_mask, 0); - return mask & ~1; -} - diff --git a/arch/um/kernel/load_file.c b/arch/um/kernel/load_file.c index 5cecd0e291fb..cb9d178ab7d8 100644 --- a/arch/um/kernel/load_file.c +++ b/arch/um/kernel/load_file.c @@ -48,9 +48,7 @@ void *uml_load_file(const char *filename, unsigned long long *size) return NULL; } - area = memblock_alloc(*size, SMP_CACHE_BYTES); - if (!area) - panic("%s: Failed to allocate %llu bytes\n", __func__, *size); + area = memblock_alloc_or_panic(*size, SMP_CACHE_BYTES); if (__uml_load_file(filename, area, *size)) { memblock_free(area, *size); diff --git a/arch/um/kernel/maccess.c b/arch/um/kernel/maccess.c deleted file mode 100644 index 8ccd56813f68..000000000000 --- a/arch/um/kernel/maccess.c +++ /dev/null @@ -1,19 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2013 Richard Weinberger <richrd@nod.at> - */ - -#include <linux/uaccess.h> -#include <linux/kernel.h> -#include <os.h> - -bool copy_from_kernel_nofault_allowed(const void *src, size_t size) -{ - void *psrc = (void *)rounddown((unsigned long)src, PAGE_SIZE); - - if ((unsigned long)src < PAGE_SIZE || size <= 0) - return false; - if (os_mincore(psrc, size + src - psrc) <= 0) - return false; - return true; -} diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index a5b4fe2ad931..76bec7de81b5 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -6,11 +6,11 @@ #include <linux/stddef.h> #include <linux/module.h> #include <linux/memblock.h> -#include <linux/highmem.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/slab.h> -#include <asm/fixmap.h> +#include <linux/init.h> +#include <asm/sections.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <as-layout.h> @@ -51,14 +51,12 @@ EXPORT_SYMBOL(empty_zero_page); pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* Initialized at boot time, and readonly after that */ -unsigned long long highmem; -EXPORT_SYMBOL(highmem); int kmalloc_ok = 0; /* Used during early boot */ static unsigned long brk_end; -void __init mem_init(void) +void __init arch_mm_preinit(void) { /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); @@ -70,13 +68,16 @@ void __init mem_init(void) map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); memblock_free((void *)brk_end, uml_reserved - brk_end); uml_reserved = brk_end; - - /* this will put all low memory onto the freelists */ - memblock_free_all(); + min_low_pfn = PFN_UP(__pa(uml_reserved)); max_pfn = max_low_pfn; +} + +void __init mem_init(void) +{ kmalloc_ok = 1; } +#if IS_ENABLED(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) /* * Create a page table and place a pointer to it in a middle page * directory entry. @@ -98,7 +99,7 @@ static void __init one_page_table_init(pmd_t *pmd) static void __init one_md_table_init(pud_t *pud) { -#ifdef CONFIG_3_LEVEL_PGTABLES +#if CONFIG_PGTABLE_LEVELS > 2 pmd_t *pmd_table = (pmd_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); if (!pmd_table) panic("%s: Failed to allocate %lu bytes align=%lx\n", @@ -109,6 +110,19 @@ static void __init one_md_table_init(pud_t *pud) #endif } +static void __init one_ud_table_init(p4d_t *p4d) +{ +#if CONFIG_PGTABLE_LEVELS > 3 + pud_t *pud_table = (pud_t *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); + if (!pud_table) + panic("%s: Failed to allocate %lu bytes align=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE); + + set_p4d(p4d, __p4d(_KERNPG_TABLE + (unsigned long) __pa(pud_table))); + BUG_ON(pud_table != pud_offset(p4d, 0)); +#endif +} + static void __init fixrange_init(unsigned long start, unsigned long end, pgd_t *pgd_base) { @@ -126,6 +140,8 @@ static void __init fixrange_init(unsigned long start, unsigned long end, for ( ; (i < PTRS_PER_PGD) && (vaddr < end); pgd++, i++) { p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) + one_ud_table_init(p4d); pud = pud_offset(p4d, vaddr); if (pud_none(*pud)) one_md_table_init(pud); @@ -140,7 +156,6 @@ static void __init fixrange_init(unsigned long start, unsigned long end, static void __init fixaddr_user_init( void) { -#ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA long size = FIXADDR_USER_END - FIXADDR_USER_START; pte_t *pte; phys_t p; @@ -162,13 +177,12 @@ static void __init fixaddr_user_init( void) pte = virt_to_kpte(vaddr); pte_set_val(*pte, p, PAGE_READONLY); } -#endif } +#endif void __init paging_init(void) { unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; - unsigned long vaddr; empty_zero_page = (unsigned long *) memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); @@ -179,14 +193,9 @@ void __init paging_init(void) max_zone_pfn[ZONE_NORMAL] = end_iomem >> PAGE_SHIFT; free_area_init(max_zone_pfn); - /* - * Fixed mappings, only the page table structure has to be - * created - mappings will be set by set_fixmap(): - */ - vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - fixrange_init(vaddr, FIXADDR_TOP, swapper_pg_dir); - +#if IS_ENABLED(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) fixaddr_user_init(); +#endif } /* @@ -202,14 +211,13 @@ void free_initmem(void) pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); + pgd_t *pgd = __pgd_alloc(mm, 0); - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + if (pgd) memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } + return pgd; } @@ -237,3 +245,11 @@ static const pgprot_t protection_map[16] = { [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED }; DECLARE_VM_GET_PAGE_PROT + +void mark_rodata_ro(void) +{ + unsigned long rodata_start = PFN_ALIGN(__start_rodata); + unsigned long rodata_end = PFN_ALIGN(__end_rodata); + + os_protect_memory((void *)rodata_start, rodata_end - rodata_start, 1, 0, 0); +} diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index fb2adfb49945..af02b5f9911d 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -22,23 +22,6 @@ static int physmem_fd = -1; unsigned long high_physmem; EXPORT_SYMBOL(high_physmem); -extern unsigned long long physmem_size; - -void __init mem_total_pages(unsigned long physmem, unsigned long iomem, - unsigned long highmem) -{ - unsigned long phys_pages, highmem_pages; - unsigned long iomem_pages, total_pages; - - phys_pages = physmem >> PAGE_SHIFT; - iomem_pages = iomem >> PAGE_SHIFT; - highmem_pages = highmem >> PAGE_SHIFT; - - total_pages = phys_pages + iomem_pages + highmem_pages; - - max_mapnr = total_pages; -} - void map_memory(unsigned long virt, unsigned long phys, unsigned long len, int r, int w, int x) { @@ -64,13 +47,12 @@ void map_memory(unsigned long virt, unsigned long phys, unsigned long len, * @reserve_end: end address of the physical kernel memory. * @len: Length of total physical memory that should be mapped/made * available, in bytes. - * @highmem: Number of highmem bytes that should be mapped/made available. * - * Creates an unlinked temporary file of size (len + highmem) and memory maps + * Creates an unlinked temporary file of size (len) and memory maps * it on the last executable image address (uml_reserved). * * The offset is needed as the length of the total physical memory - * (len + highmem) includes the size of the memory used be the executable image, + * (len) includes the size of the memory used be the executable image, * but the mapped-to address is the last address of the executable image * (uml_reserved == end address of executable image). * @@ -78,24 +60,24 @@ void map_memory(unsigned long virt, unsigned long phys, unsigned long len, * of all user space processes/kernel tasks. */ void __init setup_physmem(unsigned long start, unsigned long reserve_end, - unsigned long len, unsigned long long highmem) + unsigned long len) { unsigned long reserve = reserve_end - start; - long map_size = len - reserve; + unsigned long map_size = len - reserve; int err; - if(map_size <= 0) { + if (len <= reserve) { os_warn("Too few physical memory! Needed=%lu, given=%lu\n", reserve, len); exit(1); } - physmem_fd = create_mem_file(len + highmem); + physmem_fd = create_mem_file(len); err = os_map_memory((void *) reserve_end, physmem_fd, reserve, map_size, 1, 1, 1); if (err < 0) { - os_warn("setup_physmem - mapping %ld bytes of memory at 0x%p " + os_warn("setup_physmem - mapping %lu bytes of memory at 0x%p " "failed - errno = %d\n", map_size, (void *) reserve_end, err); exit(1); @@ -107,9 +89,8 @@ void __init setup_physmem(unsigned long start, unsigned long reserve_end, */ os_seek_file(physmem_fd, __pa(__syscall_stub_start)); os_write_file(physmem_fd, __syscall_stub_start, PAGE_SIZE); - os_fsync_file(physmem_fd); - memblock_add(__pa(start), len + highmem); + memblock_add(__pa(start), len); memblock_reserve(__pa(start), reserve); min_low_pfn = PFN_UP(__pa(reserve_end)); @@ -137,10 +118,6 @@ int phys_mapping(unsigned long phys, unsigned long long *offset_out) region = region->next; } } - else if (phys < __pa(end_iomem) + highmem) { - fd = physmem_fd; - *offset_out = phys - iomem_size; - } return fd; } @@ -149,6 +126,8 @@ EXPORT_SYMBOL(phys_mapping); static int __init uml_mem_setup(char *line, int *add) { char *retptr; + + *add = 0; physmem_size = memparse(line,&retptr); return 0; } diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index be2856af6d4c..0cd6fad3d908 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -43,7 +43,8 @@ * cares about its entry, so it's OK if another processor is modifying its * entry. */ -struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... NR_CPUS - 1] = { NULL } }; +struct task_struct *cpu_tasks[NR_CPUS]; +EXPORT_SYMBOL(cpu_tasks); void free_stack(unsigned long stack, int order) { @@ -64,7 +65,7 @@ unsigned long alloc_stack(int order, int atomic) static inline void set_current(struct task_struct *task) { - cpu_tasks[task_thread_info(task)->cpu] = ((struct cpu_task) { task }); + cpu_tasks[task_thread_info(task)->cpu] = task; } struct task_struct *__switch_to(struct task_struct *from, struct task_struct *to) @@ -116,7 +117,7 @@ void new_thread_handler(void) * callback returns only if the kernel thread execs a process */ fn(arg); - userspace(¤t->thread.regs.regs, current_thread_info()->aux_fp_regs); + userspace(¤t->thread.regs.regs); } /* Called magically, see new_thread_handler above */ @@ -133,7 +134,7 @@ static void fork_handler(void) current->thread.prev_sched = NULL; - userspace(¤t->thread.regs.regs, current_thread_info()->aux_fp_regs); + userspace(¤t->thread.regs.regs); } int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) @@ -187,6 +188,21 @@ void initial_thread_cb(void (*proc)(void *), void *arg) kmalloc_ok = save_kmalloc_ok; } +int arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src) +{ + /* init_task is not dynamically sized (missing FPU state) */ + if (unlikely(src == &init_task)) { + memcpy(dst, src, sizeof(init_task)); + memset((void *)dst + sizeof(init_task), 0, + arch_task_struct_size - sizeof(init_task)); + } else { + memcpy(dst, src, arch_task_struct_size); + } + + return 0; +} + void um_idle_sleep(void) { if (time_travel_mode != TT_MODE_OFF) @@ -205,14 +221,6 @@ int __uml_cant_sleep(void) { /* Is in_interrupt() really needed? */ } -int user_context(unsigned long sp) -{ - unsigned long stack; - - stack = sp & (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER); - return stack != (unsigned long) current_thread_info(); -} - extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end; void do_uml_exitcalls(void) @@ -287,11 +295,3 @@ unsigned long __get_wchan(struct task_struct *p) return 0; } - -int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu) -{ - int cpu = current_thread_info()->cpu; - - return save_i387_registers(userspace_pid[cpu], (unsigned long *) fpu); -} - diff --git a/arch/um/kernel/sigio.c b/arch/um/kernel/sigio.c index 5085a50c3b8c..4fc04742048a 100644 --- a/arch/um/kernel/sigio.c +++ b/arch/um/kernel/sigio.c @@ -8,32 +8,6 @@ #include <os.h> #include <sigio.h> -/* Protected by sigio_lock() called from write_sigio_workaround */ -static int sigio_irq_fd = -1; - -static irqreturn_t sigio_interrupt(int irq, void *data) -{ - char c; - - os_read_file(sigio_irq_fd, &c, sizeof(c)); - return IRQ_HANDLED; -} - -int write_sigio_irq(int fd) -{ - int err; - - err = um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt, - 0, "write sigio", NULL); - if (err < 0) { - printk(KERN_ERR "write_sigio_irq : um_request_irq failed, " - "err = %d\n", err); - return -1; - } - sigio_irq_fd = fd; - return 0; -} - /* These are called from os-Linux/sigio.c to protect its pollfds arrays. */ static DEFINE_MUTEX(sigio_mutex); diff --git a/arch/um/kernel/skas/.gitignore b/arch/um/kernel/skas/.gitignore new file mode 100644 index 000000000000..c3409ced0f38 --- /dev/null +++ b/arch/um/kernel/skas/.gitignore @@ -0,0 +1,2 @@ +stub_exe +stub_exe.dbg diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile index 6f86d53e3d69..3384be42691f 100644 --- a/arch/um/kernel/skas/Makefile +++ b/arch/um/kernel/skas/Makefile @@ -3,14 +3,48 @@ # Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) # -obj-y := stub.o mmu.o process.o syscall.o uaccess.o +obj-y := stub.o mmu.o process.o syscall.o uaccess.o \ + stub_exe_embed.o + +# Stub executable + +stub_exe_objs-y := stub_exe.o + +stub_exe_objs := $(foreach F,$(stub_exe_objs-y),$(obj)/$F) + +# Object file containing the ELF executable +$(obj)/stub_exe_embed.o: $(src)/stub_exe_embed.S $(obj)/stub_exe + +$(obj)/stub_exe.dbg: $(stub_exe_objs) FORCE + $(call if_changed,stub_exe) + +$(obj)/stub_exe: OBJCOPYFLAGS := -S +$(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE + $(call if_changed,objcopy) + +quiet_cmd_stub_exe = STUB_EXE $@ + cmd_stub_exe = $(CC) -nostdlib -o $@ \ + $(filter-out $(UM_GPROF_OPT) $(UM_GCOV_OPT),$(KBUILD_CFLAGS)) $(STUB_EXE_LDFLAGS) \ + $(filter %.o,$^) + +STUB_EXE_LDFLAGS = -Wl,-n -static + +targets += stub_exe.dbg stub_exe $(stub_exe_objs-y) + +# end # stub.o is in the stub, so it can't be built with profiling # GCC hardened also auto-enables -fpic, but we need %ebx so it can't work -> # disable it CFLAGS_stub.o := $(CFLAGS_NO_HARDENING) -UNPROFILE_OBJS := stub.o +CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING) + +# Clang will call memset() from __builtin_alloca() when stack variable +# initialization is enabled, which is used in stub_exe.c. +CFLAGS_stub_exe.o += $(call cc-option, -ftrivial-auto-var-init=uninitialized) + +UNPROFILE_OBJS := stub.o stub_exe.o KCOV_INSTRUMENT := n include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 886ed5e65674..849fafa4b54f 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -8,6 +8,7 @@ #include <linux/sched/signal.h> #include <linux/slab.h> +#include <shared/irq_kern.h> #include <asm/pgalloc.h> #include <asm/sections.h> #include <asm/mmu_context.h> @@ -19,6 +20,9 @@ /* Ensure the stub_data struct covers the allocated area */ static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); +spinlock_t mm_list_lock; +struct list_head mm_list; + int init_new_context(struct task_struct *task, struct mm_struct *mm) { struct mm_id *new_id = &mm->context.id; @@ -31,44 +35,22 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) new_id->stack = stack; - block_signals_trace(); - new_id->pid = start_userspace(stack); - unblock_signals_trace(); + scoped_guard(spinlock_irqsave, &mm_list_lock) { + /* Insert into list, used for lookups when the child dies */ + list_add(&mm->context.list, &mm_list); + } - if (new_id->pid < 0) { - ret = new_id->pid; + ret = start_userspace(new_id); + if (ret < 0) goto out_free; - } - /* - * Ensure the new MM is clean and nothing unwanted is mapped. - * - * TODO: We should clear the memory up to STUB_START to ensure there is - * nothing mapped there, i.e. we (currently) have: - * - * |- user memory -|- unused -|- stub -|- unused -| - * ^ TASK_SIZE ^ STUB_START - * - * Meaning we have two unused areas where we may still have valid - * mappings from our internal clone(). That isn't really a problem as - * userspace is not going to access them, but it is definitely not - * correct. - * - * However, we are "lucky" and if rseq is configured, then on 32 bit - * it will fall into the first empty range while on 64 bit it is going - * to use an anonymous mapping in the second range. As such, things - * continue to work for now as long as we don't start unmapping these - * areas. - * - * Change this to STUB_START once we have a clean userspace. - */ - unmap(new_id, 0, TASK_SIZE); + /* Ensure the new MM is clean and nothing unwanted is mapped */ + unmap(new_id, 0, STUB_START); return 0; out_free: - if (new_id->stack != 0) - free_pages(new_id->stack, ilog2(STUB_DATA_PAGES)); + free_pages(new_id->stack, ilog2(STUB_DATA_PAGES)); out: return ret; } @@ -82,13 +64,82 @@ void destroy_context(struct mm_struct *mm) * zero, resulting in a kill(0), which will result in the * whole UML suddenly dying. Also, cover negative and * 1 cases, since they shouldn't happen either. + * + * Negative cases happen if the child died unexpectedly. */ - if (mmu->id.pid < 2) { + if (mmu->id.pid >= 0 && mmu->id.pid < 2) { printk(KERN_ERR "corrupt mm_context - pid = %d\n", mmu->id.pid); return; } - os_kill_ptraced_process(mmu->id.pid, 1); + + if (mmu->id.pid > 0) { + os_kill_ptraced_process(mmu->id.pid, 1); + mmu->id.pid = -1; + } + + if (using_seccomp && mmu->id.sock) + os_close_file(mmu->id.sock); free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES)); + + guard(spinlock_irqsave)(&mm_list_lock); + + list_del(&mm->context.list); +} + +static irqreturn_t mm_sigchld_irq(int irq, void* dev) +{ + struct mm_context *mm_context; + pid_t pid; + + guard(spinlock)(&mm_list_lock); + + while ((pid = os_reap_child()) > 0) { + /* + * A child died, check if we have an MM with the PID. This is + * only relevant in SECCOMP mode (as ptrace will fail anyway). + * + * See wait_stub_done_seccomp for more details. + */ + list_for_each_entry(mm_context, &mm_list, list) { + if (mm_context->id.pid == pid) { + struct stub_data *stub_data; + printk("Unexpectedly lost MM child! Affected tasks will segfault."); + + /* Marks the MM as dead */ + mm_context->id.pid = -1; + + /* + * NOTE: If SMP is implemented, a futex_wake + * needs to be added here. + */ + stub_data = (void *)mm_context->id.stack; + stub_data->futex = FUTEX_IN_KERN; + + /* + * NOTE: Currently executing syscalls by + * affected tasks may finish normally. + */ + break; + } + } + } + + return IRQ_HANDLED; +} + +static int __init init_child_tracking(void) +{ + int err; + + spin_lock_init(&mm_list_lock); + INIT_LIST_HEAD(&mm_list); + + err = request_irq(SIGCHLD_IRQ, mm_sigchld_irq, 0, "SIGCHLD", NULL); + if (err < 0) + panic("Failed to register SIGCHLD IRQ: %d", err); + + return 0; } +early_initcall(init_child_tracking) diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c index 68657988c8d1..05dcdc057af9 100644 --- a/arch/um/kernel/skas/process.c +++ b/arch/um/kernel/skas/process.c @@ -22,15 +22,13 @@ static int __init start_kernel_proc(void *unused) { block_signals_trace(); - cpu_tasks[0].task = current; - start_kernel(); return 0; } extern int userspace_pid[]; -extern char cpu0_irqstack[]; +static char cpu0_irqstack[THREAD_SIZE] __aligned(THREAD_SIZE); int __init start_uml(void) { diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c index 5d52ffa682dc..67cab46a602c 100644 --- a/arch/um/kernel/skas/stub.c +++ b/arch/um/kernel/skas/stub.c @@ -5,21 +5,54 @@ #include <sysdep/stub.h> -static __always_inline int syscall_handler(struct stub_data *d) +#include <linux/futex.h> +#include <sys/socket.h> +#include <errno.h> + +/* + * Known security issues + * + * Userspace can jump to this address to execute *any* syscall that is + * permitted by the stub. As we will return afterwards, it can do + * whatever it likes, including: + * - Tricking the kernel into handing out the memory FD + * - Using this memory FD to read/write all physical memory + * - Running in parallel to the kernel processing a syscall + * (possibly creating data races?) + * - Blocking e.g. SIGALRM to avoid time based scheduling + * + * To avoid this, the permitted location for each syscall needs to be + * checked for in the SECCOMP filter (which is reasonably simple). Also, + * more care will need to go into considerations how the code might be + * tricked by using a prepared stack (or even modifying the stack from + * another thread in case SMP support is added). + * + * As for the SIGALRM, the best counter measure will be to check in the + * kernel that the process is reporting back the SIGALRM in a timely + * fashion. + */ +static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS]) { + struct stub_data *d = get_stub_data(); int i; unsigned long res; + int fd; for (i = 0; i < d->syscall_data_len; i++) { struct stub_syscall *sc = &d->syscall_data[i]; switch (sc->syscall) { case STUB_SYSCALL_MMAP: + if (fd_map) + fd = fd_map[sc->mem.fd]; + else + fd = sc->mem.fd; + res = stub_syscall6(STUB_MMAP_NR, sc->mem.addr, sc->mem.length, sc->mem.prot, MAP_SHARED | MAP_FIXED, - sc->mem.fd, sc->mem.offset); + fd, sc->mem.offset); if (res != sc->mem.addr) { d->err = res; d->syscall_data_len = i; @@ -35,16 +68,6 @@ static __always_inline int syscall_handler(struct stub_data *d) return -1; } break; - case STUB_SYSCALL_MPROTECT: - res = stub_syscall3(__NR_mprotect, - sc->mem.addr, sc->mem.length, - sc->mem.prot); - if (res) { - d->err = res; - d->syscall_data_len = i; - return -1; - } - break; default: d->err = -95; /* EOPNOTSUPP */ d->syscall_data_len = i; @@ -61,9 +84,98 @@ static __always_inline int syscall_handler(struct stub_data *d) void __section(".__syscall_stub") stub_syscall_handler(void) { + syscall_handler(NULL); + + trap_myself(); +} + +void __section(".__syscall_stub") +stub_signal_interrupt(int sig, siginfo_t *info, void *p) +{ struct stub_data *d = get_stub_data(); + char rcv_data; + union { + char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)]; + struct cmsghdr align; + } ctrl = {}; + struct iovec iov = { + .iov_base = &rcv_data, + .iov_len = 1, + }; + struct msghdr msghdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &ctrl, + .msg_controllen = sizeof(ctrl), + }; + ucontext_t *uc = p; + struct cmsghdr *fd_msg; + int *fd_map; + int num_fds; + long res; - syscall_handler(d); + d->signal = sig; + d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0]; + d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0]; - trap_myself(); +restart_wait: + d->futex = FUTEX_IN_KERN; + do { + res = stub_syscall3(__NR_futex, (unsigned long)&d->futex, + FUTEX_WAKE, 1); + } while (res == -EINTR); + + do { + res = stub_syscall4(__NR_futex, (unsigned long)&d->futex, + FUTEX_WAIT, FUTEX_IN_KERN, 0); + } while (res == -EINTR || d->futex == FUTEX_IN_KERN); + + if (res < 0 && res != -EAGAIN) + stub_syscall1(__NR_exit_group, 1); + + if (d->syscall_data_len) { + /* Read passed FDs (if any) */ + do { + res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0); + } while (res == -EINTR); + + /* We should never have a receive error (other than -EAGAIN) */ + if (res < 0 && res != -EAGAIN) + stub_syscall1(__NR_exit_group, 1); + + /* Receive the FDs */ + num_fds = 0; + fd_msg = msghdr.msg_control; + fd_map = (void *)&CMSG_DATA(fd_msg); + if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr)) + num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + + /* Try running queued syscalls. */ + res = syscall_handler(fd_map); + + while (num_fds) + stub_syscall2(__NR_close, fd_map[--num_fds], 0); + } else { + res = 0; + } + + if (res < 0 || d->restart_wait) { + /* Report SIGSYS if we restart. */ + d->signal = SIGSYS; + d->restart_wait = 0; + + goto restart_wait; + } + + /* Restore arch dependent state that is not part of the mcontext */ + stub_seccomp_restore_state(&d->arch_data); + + /* Return so that the host modified mcontext is restored. */ +} + +void __section(".__syscall_stub") +stub_signal_restorer(void) +{ + /* We must not have anything on the stack when doing rt_sigreturn */ + stub_syscall0(__NR_rt_sigreturn); } diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c new file mode 100644 index 000000000000..cbafaa684e66 --- /dev/null +++ b/arch/um/kernel/skas/stub_exe.c @@ -0,0 +1,230 @@ +#include <sys/ptrace.h> +#include <sys/prctl.h> +#include <sys/fcntl.h> +#include <asm/unistd.h> +#include <sysdep/stub.h> +#include <stub-data.h> +#include <linux/filter.h> +#include <linux/seccomp.h> +#include <generated/asm-offsets.h> + +void _start(void); + +noinline static void real_init(void) +{ + struct stub_init_data init_data; + unsigned long res; + struct { + void *ss_sp; + int ss_flags; + size_t ss_size; + } stack = { + .ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, + }; + struct { + void *sa_handler_; + unsigned long sa_flags; + void *sa_restorer; + unsigned long long sa_mask; + } sa = { + /* Need to set SA_RESTORER (but the handler never returns) */ + .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000, + }; + + /* set a nice name */ + stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace"); + + /* Make sure this process dies if the kernel dies */ + stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL); + + /* Needed in SECCOMP mode (and safe to do anyway) */ + stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + + /* read information from STDIN and close it */ + res = stub_syscall3(__NR_read, 0, + (unsigned long)&init_data, sizeof(init_data)); + if (res != sizeof(init_data)) + stub_syscall1(__NR_exit, 10); + + /* In SECCOMP mode, FD 0 is a socket and is later used for FD passing */ + if (!init_data.seccomp) + stub_syscall1(__NR_close, 0); + else + stub_syscall3(__NR_fcntl, 0, F_SETFL, O_NONBLOCK); + + /* map stub code + data */ + res = stub_syscall6(STUB_MMAP_NR, + init_data.stub_start, UM_KERN_PAGE_SIZE, + PROT_READ | PROT_EXEC, MAP_FIXED | MAP_SHARED, + init_data.stub_code_fd, init_data.stub_code_offset); + if (res != init_data.stub_start) + stub_syscall1(__NR_exit, 11); + + res = stub_syscall6(STUB_MMAP_NR, + init_data.stub_start + UM_KERN_PAGE_SIZE, + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, + PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, + init_data.stub_data_fd, init_data.stub_data_offset); + if (res != init_data.stub_start + UM_KERN_PAGE_SIZE) + stub_syscall1(__NR_exit, 12); + + /* In SECCOMP mode, we only need the signalling FD from now on */ + if (init_data.seccomp) { + res = stub_syscall3(__NR_close_range, 1, ~0U, 0); + if (res != 0) + stub_syscall1(__NR_exit, 13); + } + + /* setup signal stack inside stub data */ + stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE; + stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0); + + /* register signal handlers */ + sa.sa_handler_ = (void *) init_data.signal_handler; + sa.sa_restorer = (void *) init_data.signal_restorer; + if (!init_data.seccomp) { + /* In ptrace mode, the SIGSEGV handler never returns */ + sa.sa_mask = 0; + + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 14); + } else { + /* SECCOMP mode uses rt_sigreturn, need to mask all signals */ + sa.sa_mask = ~0ULL; + + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 15); + + res = stub_syscall4(__NR_rt_sigaction, SIGSYS, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 16); + + res = stub_syscall4(__NR_rt_sigaction, SIGALRM, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 17); + + res = stub_syscall4(__NR_rt_sigaction, SIGTRAP, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 18); + + res = stub_syscall4(__NR_rt_sigaction, SIGILL, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 19); + + res = stub_syscall4(__NR_rt_sigaction, SIGFPE, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 20); + } + + /* + * If in seccomp mode, install the SECCOMP filter and trigger a syscall. + * Otherwise set PTRACE_TRACEME and do a SIGSTOP. + */ + if (init_data.seccomp) { + struct sock_filter filter[] = { +#if __BITS_PER_LONG > 32 + /* [0] Load upper 32bit of instruction pointer from seccomp_data */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, instruction_pointer) + 4)), + + /* [1] Jump forward 3 instructions if the upper address is not identical */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) >> 32, 0, 3), +#endif + /* [2] Load lower 32bit of instruction pointer from seccomp_data */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, instruction_pointer))), + + /* [3] Mask out lower bits */ + BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000), + + /* [4] Jump to [6] if the lower bits are not on the expected page */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) & 0xfffff000, 1, 0), + + /* [5] Trap call, allow */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), + + /* [6,7] Check architecture */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, arch)), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, + UM_SECCOMP_ARCH_NATIVE, 1, 0), + + /* [8] Kill (for architecture check) */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), + + /* [9] Load syscall number */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, nr)), + + /* [10-16] Check against permitted syscalls */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex, + 7, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_recvmsg, + 6, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_close, + 5, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR, + 4, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap, + 3, 0), +#ifdef __i386__ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area, + 2, 0), +#else + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl, + 2, 0), +#endif + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn, + 1, 0), + + /* [17] Not one of the permitted syscalls */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), + + /* [18] Permitted call for the stub */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = sizeof(filter) / sizeof(filter[0]), + .filter = filter, + }; + + if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_TSYNC, + (unsigned long)&prog) != 0) + stub_syscall1(__NR_exit, 21); + + /* Fall through, the exit syscall will cause SIGSYS */ + } else { + stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); + + stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); + } + + stub_syscall1(__NR_exit, 30); + + __builtin_unreachable(); +} + +__attribute__((naked)) void _start(void) +{ + /* + * Since the stack after exec() starts at the top-most address, + * but that's exactly where we also want to map the stub data + * and code, this must: + * - push the stack by 1 code and STUB_DATA_PAGES data pages + * - call real_init() + * This way, real_init() can use the stack normally, while the + * original stack further down (higher address) will become + * inaccessible after the mmap() calls above. + */ + stub_start(real_init); +} diff --git a/arch/um/kernel/skas/stub_exe_embed.S b/arch/um/kernel/skas/stub_exe_embed.S new file mode 100644 index 000000000000..6d8914fbe8f1 --- /dev/null +++ b/arch/um/kernel/skas/stub_exe_embed.S @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/init.h> +#include <linux/linkage.h> + +__INITDATA + +SYM_DATA_START(stub_exe_start) + .incbin "arch/um/kernel/skas/stub_exe" +SYM_DATA_END_LABEL(stub_exe_start, SYM_L_GLOBAL, stub_exe_end) + +__FINIT diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index b09e85279d2b..a5beaea2967e 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -31,6 +31,17 @@ void handle_syscall(struct uml_pt_regs *r) goto out; syscall = UPT_SYSCALL_NR(r); + + /* + * If no time passes, then sched_yield may not actually yield, causing + * broken spinlock implementations in userspace (ASAN) to hang for long + * periods of time. + */ + if ((time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) && + syscall == __NR_sched_yield) + tt_extra_sched_jiffies += 1; + if (syscall >= 0 && syscall < __NR_syscalls) { unsigned long ret = EXECUTE_SYSCALL(syscall, regs); diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 4bb8622dc512..13ee5666668d 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -32,12 +32,6 @@ void show_stack(struct task_struct *task, unsigned long *stack, struct pt_regs *segv_regs = current->thread.segv_regs; int i; - if (!segv_regs && os_is_signal_stack()) { - pr_err("Received SIGSEGV in SIGSEGV handler," - " aborting stack trace!\n"); - return; - } - if (!stack) stack = get_stack_pointer(task, segv_regs); @@ -52,5 +46,5 @@ void show_stack(struct task_struct *task, unsigned long *stack, } printk("%sCall Trace:\n", loglvl); - dump_trace(current, &stackops, (void *)loglvl); + dump_trace(task ?: current, &stackops, (void *)loglvl); } diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 29b27b90581f..ae0fa2173778 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -25,6 +25,8 @@ #include <shared/init.h> #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +#include <linux/sched/clock.h> + enum time_travel_mode time_travel_mode; EXPORT_SYMBOL_GPL(time_travel_mode); @@ -47,6 +49,15 @@ static u16 time_travel_shm_id; static struct um_timetravel_schedshm *time_travel_shm; static union um_timetravel_schedshm_client *time_travel_shm_client; +unsigned long tt_extra_sched_jiffies; + +notrace unsigned long long sched_clock(void) +{ + return (unsigned long long)(jiffies - INITIAL_JIFFIES + + tt_extra_sched_jiffies) + * (NSEC_PER_SEC / HZ); +} + static void time_travel_set_time(unsigned long long ns) { if (unlikely(ns < time_travel_time)) @@ -443,6 +454,11 @@ static void time_travel_periodic_timer(struct time_travel_event *e) { time_travel_add_event(&time_travel_timer_event, time_travel_time + time_travel_timer_interval); + + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ + if (tt_extra_sched_jiffies > 0) + tt_extra_sched_jiffies -= 1; + deliver_alarm(); } @@ -594,6 +610,10 @@ EXPORT_SYMBOL_GPL(time_travel_add_irq_event); static void time_travel_oneshot_timer(struct time_travel_event *e) { + /* clock tick; decrease extra jiffies by keeping sched_clock constant */ + if (tt_extra_sched_jiffies > 0) + tt_extra_sched_jiffies -= 1; + deliver_alarm(); } @@ -836,11 +856,16 @@ static struct clock_event_device timer_clockevent = { static irqreturn_t um_timer(int irq, void *dev) { - if (get_current()->mm != NULL) - { - /* userspace - relay signal, results in correct userspace timers */ + /* + * Interrupt the (possibly) running userspace process, technically this + * should only happen if userspace is currently executing. + * With infinite CPU time-travel, we can only get here when userspace + * is not executing. Do not notify there and avoid spurious scheduling. + */ + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL && + get_current()->mm) os_alarm_process(get_current()->mm->context.id.pid); - } (*timer_clockevent.event_handler)(&timer_clockevent); diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index 548af31d4111..cf7e0d4407f2 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -23,9 +23,6 @@ struct vm_ops { int phys_fd, unsigned long long offset); int (*unmap)(struct mm_id *mm_idp, unsigned long virt, unsigned long len); - int (*mprotect)(struct mm_id *mm_idp, - unsigned long virt, unsigned long len, - unsigned int prot); }; static int kern_map(struct mm_id *mm_idp, @@ -44,15 +41,6 @@ static int kern_unmap(struct mm_id *mm_idp, return os_unmap_memory((void *)virt, len); } -static int kern_mprotect(struct mm_id *mm_idp, - unsigned long virt, unsigned long len, - unsigned int prot) -{ - return os_protect_memory((void *)virt, len, - prot & UM_PROT_READ, prot & UM_PROT_WRITE, - 1); -} - void report_enomem(void) { printk(KERN_ERR "UML ran out of memory on the host side! " @@ -65,33 +53,37 @@ static inline int update_pte_range(pmd_t *pmd, unsigned long addr, struct vm_ops *ops) { pte_t *pte; - int r, w, x, prot, ret = 0; + int ret = 0; pte = pte_offset_kernel(pmd, addr); do { - r = pte_read(*pte); - w = pte_write(*pte); - x = pte_exec(*pte); - if (!pte_young(*pte)) { - r = 0; - w = 0; - } else if (!pte_dirty(*pte)) - w = 0; - - prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) | - (x ? UM_PROT_EXEC : 0)); - if (pte_newpage(*pte)) { - if (pte_present(*pte)) { - __u64 offset; - unsigned long phys = pte_val(*pte) & PAGE_MASK; - int fd = phys_mapping(phys, &offset); - - ret = ops->mmap(ops->mm_idp, addr, PAGE_SIZE, - prot, fd, offset); - } else - ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE); - } else if (pte_newprot(*pte)) - ret = ops->mprotect(ops->mm_idp, addr, PAGE_SIZE, prot); + if (!pte_needsync(*pte)) + continue; + + if (pte_present(*pte)) { + __u64 offset; + unsigned long phys = pte_val(*pte) & PAGE_MASK; + int fd = phys_mapping(phys, &offset); + int r, w, x, prot; + + r = pte_read(*pte); + w = pte_write(*pte); + x = pte_exec(*pte); + if (!pte_young(*pte)) { + r = 0; + w = 0; + } else if (!pte_dirty(*pte)) + w = 0; + + prot = (r ? UM_PROT_READ : 0) | + (w ? UM_PROT_WRITE : 0) | + (x ? UM_PROT_EXEC : 0); + + ret = ops->mmap(ops->mm_idp, addr, PAGE_SIZE, + prot, fd, offset); + } else + ret = ops->unmap(ops->mm_idp, addr, PAGE_SIZE); + *pte = pte_mkuptodate(*pte); } while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret)); return ret; @@ -109,7 +101,7 @@ static inline int update_pmd_range(pud_t *pud, unsigned long addr, do { next = pmd_addr_end(addr, end); if (!pmd_present(*pmd)) { - if (pmd_newpage(*pmd)) { + if (pmd_needsync(*pmd)) { ret = ops->unmap(ops->mm_idp, addr, next - addr); pmd_mkuptodate(*pmd); @@ -132,7 +124,7 @@ static inline int update_pud_range(p4d_t *p4d, unsigned long addr, do { next = pud_addr_end(addr, end); if (!pud_present(*pud)) { - if (pud_newpage(*pud)) { + if (pud_needsync(*pud)) { ret = ops->unmap(ops->mm_idp, addr, next - addr); pud_mkuptodate(*pud); @@ -155,7 +147,7 @@ static inline int update_p4d_range(pgd_t *pgd, unsigned long addr, do { next = p4d_addr_end(addr, end); if (!p4d_present(*p4d)) { - if (p4d_newpage(*p4d)) { + if (p4d_needsync(*p4d)) { ret = ops->unmap(ops->mm_idp, addr, next - addr); p4d_mkuptodate(*p4d); @@ -180,18 +172,16 @@ int um_tlb_sync(struct mm_struct *mm) if (mm == &init_mm) { ops.mmap = kern_map; ops.unmap = kern_unmap; - ops.mprotect = kern_mprotect; } else { ops.mmap = map; ops.unmap = unmap; - ops.mprotect = protect; } pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, mm->context.sync_tlb_range_to); if (!pgd_present(*pgd)) { - if (pgd_newpage(*pgd)) { + if (pgd_needsync(*pgd)) { ret = ops.unmap(ops.mm_idp, addr, next - addr); pgd_mkuptodate(*pgd); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 97c8df9c4401..5b80a3a89c20 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -18,6 +18,122 @@ #include <skas.h> /* + * NOTE: UML does not have exception tables. As such, this is almost a copy + * of the code in mm/memory.c, only adjusting the logic to simply check whether + * we are coming from the kernel instead of doing an additional lookup in the + * exception table. + * We can do this simplification because we never get here if the exception was + * fixable. + */ +static inline bool get_mmap_lock_carefully(struct mm_struct *mm, bool is_user) +{ + if (likely(mmap_read_trylock(mm))) + return true; + + if (!is_user) + return false; + + return !mmap_read_lock_killable(mm); +} + +static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +{ + /* + * We don't have this operation yet. + * + * It should be easy enough to do: it's basically a + * atomic_long_try_cmpxchg_acquire() + * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but + * it also needs the proper lockdep magic etc. + */ + return false; +} + +static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, bool is_user) +{ + mmap_read_unlock(mm); + if (!is_user) + return false; + + return !mmap_write_lock_killable(mm); +} + +/* + * Helper for page fault handling. + * + * This is kind of equivalend to "mmap_read_lock()" followed + * by "find_extend_vma()", except it's a lot more careful about + * the locking (and will drop the lock on failure). + * + * For example, if we have a kernel bug that causes a page + * fault, we don't want to just use mmap_read_lock() to get + * the mm lock, because that would deadlock if the bug were + * to happen while we're holding the mm lock for writing. + * + * So this checks the exception tables on kernel faults in + * order to only do this all for instructions that are actually + * expected to fault. + * + * We can also actually take the mm lock for writing if we + * need to extend the vma, which helps the VM layer a lot. + */ +static struct vm_area_struct * +um_lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, bool is_user) +{ + struct vm_area_struct *vma; + + if (!get_mmap_lock_carefully(mm, is_user)) + return NULL; + + vma = find_vma(mm, addr); + if (likely(vma && (vma->vm_start <= addr))) + return vma; + + /* + * Well, dang. We might still be successful, but only + * if we can extend a vma to do so. + */ + if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { + mmap_read_unlock(mm); + return NULL; + } + + /* + * We can try to upgrade the mmap lock atomically, + * in which case we can continue to use the vma + * we already looked up. + * + * Otherwise we'll have to drop the mmap lock and + * re-take it, and also look up the vma again, + * re-checking it. + */ + if (!mmap_upgrade_trylock(mm)) { + if (!upgrade_mmap_lock_carefully(mm, is_user)) + return NULL; + + vma = find_vma(mm, addr); + if (!vma) + goto fail; + if (vma->vm_start <= addr) + goto success; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto fail; + } + + if (expand_stack_locked(vma, addr)) + goto fail; + +success: + mmap_write_downgrade(mm); + return vma; + +fail: + mmap_write_unlock(mm); + return NULL; +} + +/* * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by * segv(). */ @@ -43,21 +159,10 @@ int handle_page_fault(unsigned long address, unsigned long ip, if (is_user) flags |= FAULT_FLAG_USER; retry: - mmap_read_lock(mm); - vma = find_vma(mm, address); - if (!vma) - goto out; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out; - if (is_user && !ARCH_IS_STACKGROW(address)) - goto out; - vma = expand_stack(mm, address); + vma = um_lock_mm_and_find_vma(mm, address, is_user); if (!vma) goto out_nosemaphore; -good_area: *code_out = SEGV_ACCERR; if (is_write) { if (!(vma->vm_flags & VM_WRITE)) @@ -175,12 +280,14 @@ void fatal_sigsegv(void) * @sig: the signal number * @unused_si: the signal info struct; unused in this handler * @regs: the ptrace register information + * @mc: the mcontext of the signal * * The handler first extracts the faultinfo from the UML ptrace regs struct. * If the userfault did not happen in an UML userspace process, bad_segv is called. * Otherwise the signal did happen in a cloned userspace process, handle it. */ -void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { struct faultinfo * fi = UPT_FAULTINFO(regs); @@ -189,7 +296,7 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) bad_segv(*fi, UPT_IP(regs)); return; } - segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs); + segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc); } /* @@ -199,9 +306,8 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) * give us bad data! */ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, - struct uml_pt_regs *regs) + struct uml_pt_regs *regs, void *mc) { - jmp_buf *catcher; int si_code; int err; int is_write = FAULT_WRITE(fi); @@ -223,6 +329,19 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, panic("Failed to sync kernel TLBs: %d", err); goto out; } + else if (current->pagefault_disabled) { + if (!mc) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault with pagefaults disabled but no mcontext"); + } + if (!current->thread.segv_continue) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault without recovery target"); + } + mc_set_rip(mc, current->thread.segv_continue); + current->thread.segv_continue = NULL; + goto out; + } else if (current->mm == NULL) { show_regs(container_of(regs, struct pt_regs, regs)); panic("Segfault with no mm"); @@ -246,15 +365,8 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, address = 0; } - catcher = current->thread.fault_catcher; if (!err) goto out; - else if (catcher != NULL) { - current->thread.fault_addr = (void *) address; - UML_LONGJMP(catcher, 1); - } - else if (current->thread.fault_addr != NULL) - panic("fault_addr set but no fault catcher"); else if (!is_user && arch_fixup(ip, regs)) goto out; @@ -282,7 +394,8 @@ out: return 0; } -void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) +void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc) { int code, err; if (!UPT_IS_USER(regs)) { @@ -310,15 +423,8 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) } } -void bus_handler(int sig, struct siginfo *si, struct uml_pt_regs *regs) -{ - if (current->thread.fault_catcher != NULL) - UML_LONGJMP(current->thread.fault_catcher, 1); - else - relay_signal(sig, si, regs); -} - -void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { do_IRQ(WINCH_IRQ, regs); } diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 8e594cda6d77..2f5ee045bc7a 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -12,6 +12,7 @@ #include <linux/panic_notifier.h> #include <linux/seq_file.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/utsname.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ -65,9 +66,6 @@ struct cpuinfo_um boot_cpu_data = { EXPORT_SYMBOL(boot_cpu_data); -union thread_union cpu0_irqstack - __section(".data..init_irqstack") = - { .thread_info = INIT_THREAD_INFO(init_task) }; /* Changed in setup_arch, which is called in early boot */ static char host_info[(__NEW_UTS_LEN + 1) * 5]; @@ -81,7 +79,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "model name\t: UML\n"); seq_printf(m, "mode\t\t: skas\n"); seq_printf(m, "host\t\t: %s\n", host_info); - seq_printf(m, "fpu\t\t: %s\n", cpu_has(&boot_cpu_data, X86_FEATURE_FPU) ? "yes" : "no"); + seq_printf(m, "fpu\t\t: %s\n", str_yes_no(cpu_has(&boot_cpu_data, X86_FEATURE_FPU))); seq_printf(m, "flags\t\t:"); for (i = 0; i < 32*NCAPINTS; i++) if (cpu_has(&boot_cpu_data, i) && (x86_cap_flags[i] != NULL)) @@ -131,7 +129,7 @@ static int have_root __initdata; static int have_console __initdata; /* Set in uml_mem_setup and modified in linux_main */ -long long physmem_size = 64 * 1024 * 1024; +unsigned long long physmem_size = 64 * 1024 * 1024; EXPORT_SYMBOL(physmem_size); static const char *usage_string = @@ -167,19 +165,6 @@ __uml_setup("root=", uml_root_setup, " root=/dev/ubd5\n\n" ); -static int __init no_skas_debug_setup(char *line, int *add) -{ - os_warn("'debug' is not necessary to gdb UML in skas mode - run\n"); - os_warn("'gdb linux'\n"); - - return 0; -} - -__uml_setup("debug", no_skas_debug_setup, -"debug\n" -" this flag is not needed to run gdb on UML in skas mode\n\n" -); - static int __init uml_console_setup(char *line, int *add) { have_console = 1; @@ -257,6 +242,8 @@ static struct notifier_block panic_exit_notifier = { void uml_finishsetup(void) { + cpu_tasks[0] = &init_task; + atomic_notifier_chain_register(&panic_notifier_list, &panic_exit_notifier); @@ -278,7 +265,7 @@ EXPORT_SYMBOL(end_iomem); #define MIN_VMALLOC (32 * 1024 * 1024) -static void parse_host_cpu_flags(char *line) +static void __init parse_host_cpu_flags(char *line) { int i; for (i = 0; i < 32*NCAPINTS; i++) { @@ -286,7 +273,8 @@ static void parse_host_cpu_flags(char *line) set_cpu_cap(&boot_cpu_data, i); } } -static void parse_cache_line(char *line) + +static void __init parse_cache_line(char *line) { long res; char *to_parse = strstr(line, ":"); @@ -302,7 +290,24 @@ static void parse_cache_line(char *line) } } -int __init linux_main(int argc, char **argv) +static unsigned long __init get_top_address(char **envp) +{ + unsigned long top_addr = (unsigned long) &top_addr; + int i; + + /* The earliest variable should be after the program name in ELF */ + for (i = 0; envp[i]; i++) { + if ((unsigned long) envp[i] > top_addr) + top_addr = (unsigned long) envp[i]; + } + + top_addr &= ~(UM_KERN_PAGE_SIZE - 1); + top_addr += UM_KERN_PAGE_SIZE; + + return top_addr; +} + +int __init linux_main(int argc, char **argv, char **envp) { unsigned long avail, diff; unsigned long virtmem_size, max_physmem; @@ -324,20 +329,23 @@ int __init linux_main(int argc, char **argv) if (have_console == 0) add_arg(DEFAULT_COMMAND_LINE_CONSOLE); - host_task_size = os_get_top_address(); - /* reserve a few pages for the stubs (taking care of data alignment) */ - /* align the data portion */ - BUILD_BUG_ON(!is_power_of_2(STUB_DATA_PAGES)); - stub_start = (host_task_size - 1) & ~(STUB_DATA_PAGES * PAGE_SIZE - 1); + host_task_size = get_top_address(envp); + /* reserve a few pages for the stubs */ + stub_start = host_task_size - STUB_DATA_PAGES * PAGE_SIZE; /* another page for the code portion */ stub_start -= PAGE_SIZE; host_task_size = stub_start; + /* Limit TASK_SIZE to what is addressable by the page table */ + task_size = host_task_size; + if (task_size > (unsigned long long) PTRS_PER_PGD * PGDIR_SIZE) + task_size = PTRS_PER_PGD * PGDIR_SIZE; + /* * TASK_SIZE needs to be PGDIR_SIZE aligned or else exit_mmap craps * out */ - task_size = host_task_size & PGDIR_MASK; + task_size = task_size & PGDIR_MASK; /* OS sanity checks that need to happen before the kernel runs */ os_early_checks(); @@ -366,23 +374,18 @@ int __init linux_main(int argc, char **argv) setup_machinename(init_utsname()->machine); - highmem = 0; + physmem_size = (physmem_size + PAGE_SIZE - 1) & PAGE_MASK; iomem_size = (iomem_size + PAGE_SIZE - 1) & PAGE_MASK; - max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; - /* - * Zones have to begin on a 1 << MAX_PAGE_ORDER page boundary, - * so this makes sure that's true for highmem - */ - max_physmem &= ~((1 << (PAGE_SHIFT + MAX_PAGE_ORDER)) - 1); - if (physmem_size + iomem_size > max_physmem) { - highmem = physmem_size + iomem_size - max_physmem; - physmem_size -= highmem; + max_physmem = TASK_SIZE - uml_physmem - iomem_size - MIN_VMALLOC; + if (physmem_size > max_physmem) { + physmem_size = max_physmem; + os_info("Physical memory size shrunk to %llu bytes\n", + physmem_size); } high_physmem = uml_physmem + physmem_size; end_iomem = high_physmem + iomem_size; - high_memory = (void *) end_iomem; start_vm = VMALLOC_START; @@ -398,6 +401,8 @@ int __init linux_main(int argc, char **argv) os_info("Kernel virtual memory size shrunk to %lu bytes\n", virtmem_size); + arch_task_struct_size = sizeof(struct task_struct) + host_fp_size; + os_flush_stdout(); return start_uml(); @@ -412,9 +417,8 @@ void __init setup_arch(char **cmdline_p) { u8 rng_seed[32]; - stack_protections((unsigned long) &init_thread_info); - setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem); - mem_total_pages(physmem_size, iomem_size, highmem); + stack_protections((unsigned long) init_task.stack); + setup_physmem(uml_physmem, uml_reserved, physmem_size); uml_dtb_init(); read_initrd(); @@ -468,7 +472,12 @@ void *text_poke(void *addr, const void *opcode, size_t len) return memcpy(addr, opcode, len); } -void text_poke_sync(void) +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + return text_poke(addr, opcode, len); +} + +void smp_text_poke_sync_each_cpu(void) { } diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 5c92d58a78e8..a409d4b66114 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -77,8 +77,6 @@ SECTIONS .data : { INIT_TASK_DATA(KERNEL_STACK_SIZE) - . = ALIGN(KERNEL_STACK_SIZE); - *(.data..init_irqstack) DATA_DATA *(.gnu.linkonce.d*) CONSTRUCTORS |