From c78e93630d15b5f5774213aad9bdc9f52473a89b Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Tue, 12 Nov 2013 15:08:15 -0800 Subject: mm: do not walk all of system memory during show_mem It has been reported on very large machines that show_mem is taking almost 5 minutes to display information. This is a serious problem if there is an OOM storm. The bulk of the cost is in show_mem doing a very expensive PFN walk to give us the following information Total RAM: Also available as totalram_pages Highmem pages: Also available as totalhigh_pages Reserved pages: Can be inferred from the zone structure Shared pages: PFN walk required Unshared pages: PFN walk required Quick pages: Per-cpu walk required Only the shared/unshared pages requires a full PFN walk but that information is useless. It is also inaccurate as page pins of unshared pages would be accounted for as shared. Even if the information was accurate, I'm struggling to think how the shared/unshared information could be useful for debugging OOM conditions. Maybe it was useful before rmap existed when reclaiming shared pages was costly but it is less relevant today. The PFN walk could be optimised a bit but why bother as the information is useless. This patch deletes the PFN walker and infers the total RAM, highmem and reserved pages count from struct zone. It omits the shared/unshared page usage on the grounds that it is useless. It also corrects the reporting of HighMem as HighMem/MovableOnly as ZONE_MOVABLE has similar problems to HighMem with respect to lowmem/highmem exhaustion. Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: David Rientjes <rientjes@google.com> Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- lib/show_mem.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) (limited to 'lib') diff --git a/lib/show_mem.c b/lib/show_mem.c index b7c72311ad0..5847a4921b8 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -12,8 +12,7 @@ void show_mem(unsigned int filter) { pg_data_t *pgdat; - unsigned long total = 0, reserved = 0, shared = 0, - nonshared = 0, highmem = 0; + unsigned long total = 0, reserved = 0, highmem = 0; printk("Mem-Info:\n"); show_free_areas(filter); @@ -22,43 +21,27 @@ void show_mem(unsigned int filter) return; for_each_online_pgdat(pgdat) { - unsigned long i, flags; + unsigned long flags; + int zoneid; pgdat_resize_lock(pgdat, &flags); - for (i = 0; i < pgdat->node_spanned_pages; i++) { - struct page *page; - unsigned long pfn = pgdat->node_start_pfn + i; - - if (unlikely(!(i % MAX_ORDER_NR_PAGES))) - touch_nmi_watchdog(); - - if (!pfn_valid(pfn)) + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) continue; - page = pfn_to_page(pfn); - - if (PageHighMem(page)) - highmem++; + total += zone->present_pages; + reserved = zone->present_pages - zone->managed_pages; - if (PageReserved(page)) - reserved++; - else if (page_count(page) == 1) - nonshared++; - else if (page_count(page) > 1) - shared += page_count(page) - 1; - - total++; + if (is_highmem_idx(zoneid)) + highmem += zone->present_pages; } pgdat_resize_unlock(pgdat, &flags); } printk("%lu pages RAM\n", total); -#ifdef CONFIG_HIGHMEM - printk("%lu pages HighMem\n", highmem); -#endif + printk("%lu pages HighMem/MovableOnly\n", highmem); printk("%lu pages reserved\n", reserved); - printk("%lu pages shared\n", shared); - printk("%lu pages non-shared\n", nonshared); #ifdef CONFIG_QUICKLIST printk("%lu pages in pagetable cache\n", quicklist_total_size()); -- cgit v1.2.3 From 623fd8072c7c4d77a184bc9e35192acf480c18e4 Mon Sep 17 00:00:00 2001 From: Greg Thelen <gthelen@google.com> Date: Tue, 12 Nov 2013 15:08:34 -0800 Subject: percpu: add test module for various percpu operations Tests various percpu operations. Enable with CONFIG_PERCPU_TEST=m. Signed-off-by: Greg Thelen <gthelen@google.com> Acked-by: Tejun Heo <tj@kernel.org> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- lib/Kconfig.debug | 9 ++++ lib/Makefile | 2 + lib/percpu_test.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+) create mode 100644 lib/percpu_test.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ebef88f61b7..db25707aa41 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1481,6 +1481,15 @@ config INTERVAL_TREE_TEST help A benchmark measuring the performance of the interval tree library +config PERCPU_TEST + tristate "Per cpu operations test" + depends on m && DEBUG_KERNEL + help + Enable this option to build test module which validates per-cpu + operations. + + If unsure, say N. + config ATOMIC64_SELFTEST bool "Perform an atomic64_t self-test at boot" help diff --git a/lib/Makefile b/lib/Makefile index f3bb2cb98ad..bb016e116ba 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -157,6 +157,8 @@ obj-$(CONFIG_INTERVAL_TREE_TEST) += interval_tree_test.o interval_tree_test-objs := interval_tree_test_main.o interval_tree.o +obj-$(CONFIG_PERCPU_TEST) += percpu_test.o + obj-$(CONFIG_ASN1) += asn1_decoder.o obj-$(CONFIG_FONT_SUPPORT) += fonts/ diff --git a/lib/percpu_test.c b/lib/percpu_test.c new file mode 100644 index 00000000000..0b5d14dadd1 --- /dev/null +++ b/lib/percpu_test.c @@ -0,0 +1,138 @@ +#include <linux/module.h> + +/* validate @native and @pcp counter values match @expected */ +#define CHECK(native, pcp, expected) \ + do { \ + WARN((native) != (expected), \ + "raw %ld (0x%lx) != expected %lld (0x%llx)", \ + (native), (native), \ + (long long)(expected), (long long)(expected)); \ + WARN(__this_cpu_read(pcp) != (expected), \ + "pcp %ld (0x%lx) != expected %lld (0x%llx)", \ + __this_cpu_read(pcp), __this_cpu_read(pcp), \ + (long long)(expected), (long long)(expected)); \ + } while (0) + +static DEFINE_PER_CPU(long, long_counter); +static DEFINE_PER_CPU(unsigned long, ulong_counter); + +static int __init percpu_test_init(void) +{ + /* + * volatile prevents compiler from optimizing it uses, otherwise the + * +ul_one/-ul_one below would replace with inc/dec instructions. + */ + volatile unsigned int ui_one = 1; + long l = 0; + unsigned long ul = 0; + + pr_info("percpu test start\n"); + + preempt_disable(); + + l += -1; + __this_cpu_add(long_counter, -1); + CHECK(l, long_counter, -1); + + l += 1; + __this_cpu_add(long_counter, 1); + CHECK(l, long_counter, 0); + + ul = 0; + __this_cpu_write(ulong_counter, 0); + + ul += 1UL; + __this_cpu_add(ulong_counter, 1UL); + CHECK(ul, ulong_counter, 1); + + ul += -1UL; + __this_cpu_add(ulong_counter, -1UL); + CHECK(ul, ulong_counter, 0); + + ul += -(unsigned long)1; + __this_cpu_add(ulong_counter, -(unsigned long)1); + CHECK(ul, ulong_counter, -1); + + ul = 0; + __this_cpu_write(ulong_counter, 0); + + ul -= 1; + __this_cpu_dec(ulong_counter); + CHECK(ul, ulong_counter, -1); + CHECK(ul, ulong_counter, ULONG_MAX); + + l += -ui_one; + __this_cpu_add(long_counter, -ui_one); + CHECK(l, long_counter, 0xffffffff); + + l += ui_one; + __this_cpu_add(long_counter, ui_one); + CHECK(l, long_counter, (long)0x100000000LL); + + + l = 0; + __this_cpu_write(long_counter, 0); + + l -= ui_one; + __this_cpu_sub(long_counter, ui_one); + CHECK(l, long_counter, -1); + + l = 0; + __this_cpu_write(long_counter, 0); + + l += ui_one; + __this_cpu_add(long_counter, ui_one); + CHECK(l, long_counter, 1); + + l += -ui_one; + __this_cpu_add(long_counter, -ui_one); + CHECK(l, long_counter, (long)0x100000000LL); + + l = 0; + __this_cpu_write(long_counter, 0); + + l -= ui_one; + this_cpu_sub(long_counter, ui_one); + CHECK(l, long_counter, -1); + CHECK(l, long_counter, ULONG_MAX); + + ul = 0; + __this_cpu_write(ulong_counter, 0); + + ul += ui_one; + __this_cpu_add(ulong_counter, ui_one); + CHECK(ul, ulong_counter, 1); + + ul = 0; + __this_cpu_write(ulong_counter, 0); + + ul -= ui_one; + __this_cpu_sub(ulong_counter, ui_one); + CHECK(ul, ulong_counter, -1); + CHECK(ul, ulong_counter, ULONG_MAX); + + ul = 3; + __this_cpu_write(ulong_counter, 3); + + ul = this_cpu_sub_return(ulong_counter, ui_one); + CHECK(ul, ulong_counter, 2); + + ul = __this_cpu_sub_return(ulong_counter, ui_one); + CHECK(ul, ulong_counter, 1); + + preempt_enable(); + + pr_info("percpu test done\n"); + return -EAGAIN; /* Fail will directly unload the module */ +} + +static void __exit percpu_test_exit(void) +{ +} + +module_init(percpu_test_init) +module_exit(percpu_test_exit) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Greg Thelen"); +MODULE_DESCRIPTION("percpu operations test"); -- cgit v1.2.3 From 312b4e226951f707e120b95b118cbc14f3d162b2 Mon Sep 17 00:00:00 2001 From: Ryan Mallon <rmallon@gmail.com> Date: Tue, 12 Nov 2013 15:08:51 -0800 Subject: vsprintf: check real user/group id for %pK Some setuid binaries will allow reading of files which have read permission by the real user id. This is problematic with files which use %pK because the file access permission is checked at open() time, but the kptr_restrict setting is checked at read() time. If a setuid binary opens a %pK file as an unprivileged user, and then elevates permissions before reading the file, then kernel pointer values may be leaked. This happens for example with the setuid pppd application on Ubuntu 12.04: $ head -1 /proc/kallsyms 00000000 T startup_32 $ pppd file /proc/kallsyms pppd: In file /proc/kallsyms: unrecognized option 'c1000000' This will only leak the pointer value from the first line, but other setuid binaries may leak more information. Fix this by adding a check that in addition to the current process having CAP_SYSLOG, that effective user and group ids are equal to the real ids. If a setuid binary reads the contents of a file which uses %pK then the pointer values will be printed as NULL if the real user is unprivileged. Update the sysctl documentation to reflect the changes, and also correct the documentation to state the kptr_restrict=0 is the default. This is a only temporary solution to the issue. The correct solution is to do the permission check at open() time on files, and to replace %pK with a function which checks the open() time permission. %pK uses in printk should be removed since no sane permission check can be done, and instead protected by using dmesg_restrict. Signed-off-by: Ryan Mallon <rmallon@gmail.com> Cc: Kees Cook <keescook@chromium.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Joe Perches <joe@perches.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- Documentation/sysctl/kernel.txt | 25 ++++++++++++++++++------- lib/vsprintf.c | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 4273b2d71a2..26b7ee491df 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -290,13 +290,24 @@ Default value is "/sbin/hotplug". kptr_restrict: This toggle indicates whether restrictions are placed on -exposing kernel addresses via /proc and other interfaces. When -kptr_restrict is set to (0), there are no restrictions. When -kptr_restrict is set to (1), the default, kernel pointers -printed using the %pK format specifier will be replaced with 0's -unless the user has CAP_SYSLOG. When kptr_restrict is set to -(2), kernel pointers printed using %pK will be replaced with 0's -regardless of privileges. +exposing kernel addresses via /proc and other interfaces. + +When kptr_restrict is set to (0), the default, there are no restrictions. + +When kptr_restrict is set to (1), kernel pointers printed using the %pK +format specifier will be replaced with 0's unless the user has CAP_SYSLOG +and effective user and group ids are equal to the real ids. This is +because %pK checks are done at read() time rather than open() time, so +if permissions are elevated between the open() and the read() (e.g via +a setuid binary) then %pK will not leak kernel pointers to unprivileged +users. Note, this is a temporary solution only. The correct long-term +solution is to do the permission checks at open() time. Consider removing +world read permissions from files that use %pK, and using dmesg_restrict +to protect against uses of %pK in dmesg(8) if leaking kernel pointer +values to unprivileged users is a concern. + +When kptr_restrict is set to (2), kernel pointers printed using +%pK will be replaced with 0's regardless of privileges. ============================================================== diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 26559bdb4c4..d76555c45ff 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -27,6 +27,7 @@ #include <linux/uaccess.h> #include <linux/ioport.h> #include <linux/dcache.h> +#include <linux/cred.h> #include <net/addrconf.h> #include <asm/page.h> /* for PAGE_SIZE */ @@ -1312,11 +1313,37 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, spec.field_width = default_width; return string(buf, end, "pK-error", spec); } - if (!((kptr_restrict == 0) || - (kptr_restrict == 1 && - has_capability_noaudit(current, CAP_SYSLOG)))) + + switch (kptr_restrict) { + case 0: + /* Always print %pK values */ + break; + case 1: { + /* + * Only print the real pointer value if the current + * process has CAP_SYSLOG and is running with the + * same credentials it started with. This is because + * access to files is checked at open() time, but %pK + * checks permission at read() time. We don't want to + * leak pointer values if a binary opens a file using + * %pK and then elevates privileges before reading it. + */ + const struct cred *cred = current_cred(); + + if (!has_capability_noaudit(current, CAP_SYSLOG) || + !uid_eq(cred->euid, cred->uid) || + !gid_eq(cred->egid, cred->gid)) + ptr = NULL; + break; + } + case 2: + default: + /* Always print 0's for %pK */ ptr = NULL; + break; + } break; + case 'N': switch (fmt[1]) { case 'F': -- cgit v1.2.3 From d3773ba13c8ca63a4ee9e7926813f6e82be27a6c Mon Sep 17 00:00:00 2001 From: Xie XiuQi <xiexiuqi@huawei.com> Date: Tue, 12 Nov 2013 15:09:49 -0800 Subject: lib/debugobjects.c: remove unnecessary work pending test Remove unnecessary work pending test before calling schedule_work(). It has been tested in queue_work_on() already. No functional changed. Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com> Reviewed-by: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- lib/debugobjects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index bf2c8b1043d..e0731c3db70 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -196,7 +196,7 @@ static void free_object(struct debug_obj *obj) * initialized: */ if (obj_pool_free > ODEBUG_POOL_SIZE && obj_cache) - sched = keventd_up() && !work_pending(&debug_obj_work); + sched = keventd_up(); hlist_add_head(&obj->node, &obj_pool); obj_pool_free++; obj_pool_used--; -- cgit v1.2.3 From c0d92a57a88586586b8cb9c7ac149bd10bc40d11 Mon Sep 17 00:00:00 2001 From: Olof Johansson <olof@lixom.net> Date: Tue, 12 Nov 2013 15:09:50 -0800 Subject: lib/vsprintf.c: document formats for dentry and struct file Looks like these were added to Documentation/printk-formats.txt but not the in-file table. Signed-off-by: Olof Johansson <olof@lixom.net> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- lib/vsprintf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d76555c45ff..48586ac3a62 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1219,6 +1219,8 @@ int kptr_restrict __read_mostly; * The maximum supported length is 64 bytes of the input. Consider * to use print_hex_dump() for the larger input. * - 'a' For a phys_addr_t type and its derivative types (passed by reference) + * - 'd[234]' For a dentry name (optionally 2-4 last components) + * - 'D[234]' Same as 'd' but for a struct file * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a -- cgit v1.2.3 From ff6092a8a413f3db1938b4606e078fa751ed6cdb Mon Sep 17 00:00:00 2001 From: Duan Jiong <duanj.fnst@cn.fujitsu.com> Date: Tue, 12 Nov 2013 15:09:51 -0800 Subject: lib/digsig.c: use ERR_CAST inlined function instead of ERR_PTR(PTR_ERR(...)) Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com> Cc: James Morris <james.l.morris@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- lib/digsig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/digsig.c b/lib/digsig.c index 2f31e6a45f0..8793aeda30c 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -209,7 +209,7 @@ int digsig_verify(struct key *keyring, const char *sig, int siglen, kref = keyring_search(make_key_ref(keyring, 1UL), &key_type_user, name); if (IS_ERR(kref)) - key = ERR_PTR(PTR_ERR(kref)); + key = ERR_CAST(kref); else key = key_ref_to_ptr(kref); } else { -- cgit v1.2.3 From 684f0d3d14f2744ae7ad79063b21909e32bf444e Mon Sep 17 00:00:00 2001 From: Nicolin Chen <b42378@freescale.com> Date: Tue, 12 Nov 2013 15:09:52 -0800 Subject: lib/genalloc: add a helper function for DMA buffer allocation When using pool space for DMA buffer, there might be duplicated calling of gen_pool_alloc() and gen_pool_virt_to_phys() in each implementation. Thus it's better to add a simple helper function, a compatible one to the common dma_alloc_coherent(), to save some code. Signed-off-by: Nicolin Chen <b42378@freescale.com> Cc: "Hans J. Koch" <hjk@hansjkoch.de> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Eric Miao <eric.y.miao@gmail.com> Cc: Grant Likely <grant.likely@linaro.org> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Haojian Zhuang <haojian.zhuang@gmail.com> Cc: Jaroslav Kysela <perex@perex.cz> Cc: Kevin Hilman <khilman@deeprootsystems.com> Cc: Liam Girdwood <lgirdwood@gmail.com> Cc: Mark Brown <broonie@kernel.org> Cc: Mauro Carvalho Chehab <m.chehab@samsung.com> Cc: Rob Herring <rob.herring@calxeda.com> Cc: Russell King <linux@arm.linux.org.uk> Cc: Sekhar Nori <nsekhar@ti.com> Cc: Takashi Iwai <tiwai@suse.de> Cc: Vinod Koul <vinod.koul@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/genalloc.h | 2 ++ lib/genalloc.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'lib') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index f8d41cb1cbe..1eda33d7cb1 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -94,6 +94,8 @@ static inline int gen_pool_add(struct gen_pool *pool, unsigned long addr, } extern void gen_pool_destroy(struct gen_pool *); extern unsigned long gen_pool_alloc(struct gen_pool *, size_t); +extern void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, + dma_addr_t *dma); extern void gen_pool_free(struct gen_pool *, unsigned long, size_t); extern void gen_pool_for_each_chunk(struct gen_pool *, void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *); diff --git a/lib/genalloc.c b/lib/genalloc.c index 26cf20be72b..dda31168844 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -312,6 +312,34 @@ retry: } EXPORT_SYMBOL(gen_pool_alloc); +/** + * gen_pool_dma_alloc - allocate special memory from the pool for DMA usage + * @pool: pool to allocate from + * @size: number of bytes to allocate from the pool + * @dma: dma-view physical address + * + * Allocate the requested number of bytes from the specified pool. + * Uses the pool allocation function (with first-fit algorithm by default). + * Can not be used in NMI handler on architectures without + * NMI-safe cmpxchg implementation. + */ +void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma) +{ + unsigned long vaddr; + + if (!pool) + return NULL; + + vaddr = gen_pool_alloc(pool, size); + if (!vaddr) + return NULL; + + *dma = gen_pool_virt_to_phys(pool, vaddr); + + return (void *)vaddr; +} +EXPORT_SYMBOL(gen_pool_dma_alloc); + /** * gen_pool_free - free allocated special memory back to the pool * @pool: pool to free to -- cgit v1.2.3