From 1b9dcf479476dccf260a9533323b3f0e15ee2108 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 8 May 2023 12:46:21 +0300 Subject: lockdep: Swap storage for pin_count and references As a lockmap takes a reference for every ww_mutex used together, this can be an arbitrarily large number and under control of userspace -- easily overflowing the arbitrary limit of 4096. However, the pin_count (used for detecting unexpected lock dropping) is a full 32b despite nesting being extremely rare (see lockdep_pin_lock). References: https://gitlab.freedesktop.org/drm/intel/-/issues/8028 Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20190425092004.9995-33-chris@chris-wilson.co.uk Signed-off-by: Rodrigo Vivi [Joonas: Converting to pin_count:11 as per addition of sync:1] Signed-off-by: Joonas Lahtinen Signed-off-by: Rodrigo Vivi --- include/linux/lockdep_types.h | 4 ++-- kernel/locking/lockdep.c | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index eae115a26488..fd524e8d5061 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -252,8 +252,8 @@ struct held_lock { unsigned int check:1; /* see lock_acquire() comment */ unsigned int hardirqs_off:1; unsigned int sync:1; - unsigned int references:11; /* 32 bits */ - unsigned int pin_count; + unsigned int pin_count:11; /* 32 bits */ + unsigned int references; }; #else /* !CONFIG_LOCKDEP */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2d4c5bab5af8..b9edc0339273 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5623,11 +5623,14 @@ static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock) if (match_held_lock(hlock, lock)) { /* - * Grab 16bits of randomness; this is sufficient to not - * be guessable and still allows some pin nesting in - * our u32 pin_count. 
+ * Grab 6bits of randomness; this is barely sufficient + * to not be guessable and still allows some 32 levels + * of pin nesting in our u11 pin_count. */ - cookie.val = 1 + (sched_clock() & 0xffff); + cookie.val = 1 + (sched_clock() & 0x3f); + if (DEBUG_LOCKS_WARN_ON(hlock->pin_count + cookie.val >= 1 << 11)) + return NIL_COOKIE; + hlock->pin_count += cookie.val; return cookie; } -- cgit v1.2.3 From 180d590abd94c31f492d599d77cfd98d36cce490 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 3 Sep 2018 14:17:45 +0100 Subject: kernel/panic: Show the stacktrace after additional notifier messages Most systems keep the last messages from the panic, and we value the stacktrace most, so dump it last in order to preserve it for post-mortems. References: https://gitlab.freedesktop.org/drm/intel/-/issues/8030 Signed-off-by: Chris Wilson Acked-by: Martin Peres Link: https://patchwork.freedesktop.org/patch/msgid/20180903131745.30593-1-chris@chris-wilson.co.uk Signed-off-by: Rodrigo Vivi --- kernel/panic.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index 72fcbb5a071b..7e1cd220b2cf 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -422,13 +422,6 @@ void vpanic(const char *fmt, va_list args) buf[len - 1] = '\0'; pr_emerg("Kernel panic - not syncing: %s\n", buf); -#ifdef CONFIG_DEBUG_BUGVERBOSE - /* - * Avoid nested stack-dumping if a panic occurs during oops processing - */ - if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) - dump_stack(); -#endif /* * If kgdb is enabled, give it a chance to run before we stop all @@ -460,6 +453,14 @@ void vpanic(const char *fmt, va_list args) sys_info(panic_print); +#ifdef CONFIG_DEBUG_BUGVERBOSE + /* + * Avoid nested stack-dumping if a panic occurs during oops processing + */ + if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) + dump_stack(); +#endif + kmsg_dump_desc(KMSG_DUMP_PANIC, buf); /* -- cgit v1.2.3 From 8de6615e41f0fbd7a68d37010ac5dec17d873144 Mon Sep 
17 00:00:00 2001 From: Chris Wilson Date: Tue, 9 Oct 2018 12:35:21 +0100 Subject: x86: Downgrade clock throttling thermal event critical error Under CI testing, it is common for the cpus to overheat with the continuous workloads and end up being throttled. As the cpus still function, it is less of a critical error meriting urgent action, but an expected yet significant condition (pr_notice). References: https://gitlab.freedesktop.org/drm/intel/-/issues/8031 Signed-off-by: Chris Wilson Cc: Petri Latvala Signed-off-by: Rodrigo Vivi --- drivers/thermal/intel/therm_throt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/intel/therm_throt.c b/drivers/thermal/intel/therm_throt.c index debc94e2dc16..94f393d60f9e 100644 --- a/drivers/thermal/intel/therm_throt.c +++ b/drivers/thermal/intel/therm_throt.c @@ -345,10 +345,10 @@ static void __maybe_unused throttle_active_work(struct work_struct *work) avg /= ARRAY_SIZE(state->temp_samples); if (state->average > avg) { - pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n", - this_cpu, - state->level == CORE_LEVEL ? "Core" : "Package", - state->count); + pr_notice("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n", + this_cpu, + state->level == CORE_LEVEL ? 
"Core" : "Package", + state->count); state->rate_control_active = true; } -- cgit v1.2.3 From 64a58889d381f49ca544c9feabeead4447b318a0 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Tue, 16 Nov 2021 09:22:48 +0100 Subject: libata: Downgrade unsupported feature warnings to notifications References: https://gitlab.freedesktop.org/drm/intel/-/issues/8032 Signed-off-by: Chris Wilson Cc: Petri Latvala [danvet: Rebase] Signed-off-by: Rodrigo Vivi --- drivers/ata/libata-core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index ff53f5f029b4..8218b1f5c75b 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2210,7 +2210,7 @@ static bool ata_identify_page_supported(struct ata_device *dev, u8 page) * for drives which implement this ATA level or above. */ if (ata_id_major_version(dev->id) >= 10) - ata_dev_warn(dev, + ata_dev_notice(dev, "ATA Identify Device Log not supported\n"); dev->quirks |= ATA_QUIRK_NO_ID_DEV_LOG; return false; @@ -2281,7 +2281,7 @@ static void ata_dev_config_ncq_send_recv(struct ata_device *dev) unsigned int err_mask; if (!ata_log_supported(dev, ATA_LOG_NCQ_SEND_RECV)) { - ata_dev_warn(dev, "NCQ Send/Recv Log not supported\n"); + ata_dev_notice(dev, "NCQ Send/Recv Log not supported\n"); return; } err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_SEND_RECV, @@ -2305,8 +2305,8 @@ static void ata_dev_config_ncq_non_data(struct ata_device *dev) unsigned int err_mask; if (!ata_log_supported(dev, ATA_LOG_NCQ_NON_DATA)) { - ata_dev_warn(dev, - "NCQ Non-Data Log not supported\n"); + ata_dev_notice(dev, + "NCQ Non-Data Log not supported\n"); return; } err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_NON_DATA, @@ -2997,14 +2997,14 @@ int ata_dev_configure(struct ata_device *dev) if (ata_id_is_cfa(id)) { /* CPRM may make this media unusable */ if (id[ATA_ID_CFA_KEY_MGMT] & 1) - ata_dev_warn(dev, + ata_dev_notice(dev, "supports DRM functions and may not be 
fully accessible\n"); snprintf(revbuf, 7, "CFA"); } else { snprintf(revbuf, 7, "ATA-%d", ata_id_major_version(id)); /* Warn the user if the device has TPM extensions */ if (ata_id_has_tpm(id)) - ata_dev_warn(dev, + ata_dev_notice(dev, "supports DRM functions and may not be fully accessible\n"); } @@ -3161,8 +3161,8 @@ int ata_dev_configure(struct ata_device *dev) } if ((dev->quirks & ATA_QUIRK_FIRMWARE_WARN) && print_info) { - ata_dev_warn(dev, "WARNING: device requires firmware update to be fully functional\n"); - ata_dev_warn(dev, " contact the vendor or visit http://ata.wiki.kernel.org\n"); + ata_dev_notice(dev, "WARNING: device requires firmware update to be fully functional\n"); + ata_dev_notice(dev, " contact the vendor or visit http://ata.wiki.kernel.org\n"); } return 0; -- cgit v1.2.3 From d05a7e8cbb44f863afea71233d14073b60a3c02e Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Thu, 2 May 2019 22:46:48 +0200 Subject: RFC: hung_task: taint kernel There's the hung_task_panic sysctl, but that's a bit an extreme measure. As a fallback taint at least the machine. Our CI uses this to decide when a reboot is necessary, plus to figure out whether the kernel is still happy. v2: Works much better when I put the else { add_taint() } at the right place. References: https://gitlab.freedesktop.org/drm/intel/-/issues/8034 Signed-off-by: Daniel Vetter Cc: Andrew Morton Cc: Tetsuo Handa Cc: Dmitry Vyukov Cc: "Paul E. 
McKenney" Cc: Valdis Kletnieks Cc: Daniel Vetter Cc: Vitaly Kuznetsov Cc: "Liu, Chuansheng" Acked-by: Chris Wilson (for core-for-CI) Link: https://patchwork.freedesktop.org/patch/msgid/20190502204648.5537-1-daniel.vetter@ffwll.ch Signed-off-by: Jani Nikula Signed-off-by: Rodrigo Vivi --- kernel/hung_task.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 8708a1205f82..2e7c4eea6def 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -223,6 +223,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) console_verbose(); hung_task_show_lock = true; hung_task_call_panic = true; + } else { + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } /* -- cgit v1.2.3 From 82355d5884e92186dcedbe35091c5f1581d10738 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Thu, 2 May 2019 21:42:08 +0200 Subject: RFC: soft/hardlockup: taint kernel There's the soft/hardlockup_panic sysctls, but that's a bit of an extreme measure. As a fallback taint at least the machine. Our CI uses this to decide when a reboot is necessary, plus to figure out whether the kernel is still happy. 
References: https://gitlab.freedesktop.org/drm/intel/-/issues/8035 Signed-off-by: Daniel Vetter Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Valdis Kletnieks Cc: Laurence Oberman Cc: Vincent Whitchurch Cc: Don Zickus Cc: Andrew Morton Cc: Sergey Senozhatsky Cc: Sinan Kaya Cc: Daniel Vetter Acked-by: Chris Wilson (for core-for-CI) Link: https://patchwork.freedesktop.org/patch/msgid/20190502194208.3535-2-daniel.vetter@ffwll.ch Signed-off-by: Jani Nikula Signed-off-by: Rodrigo Vivi --- kernel/watchdog.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 80b56c002c7f..beda49b200bb 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -242,6 +242,8 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs) if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); + else + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); per_cpu(watchdog_hardlockup_warned, cpu) = true; } else { @@ -830,6 +832,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); + else + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } return HRTIMER_RESTART; -- cgit v1.2.3 From 913f84d872a97fed50258a2c1e59c535dd4c41e0 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 8 May 2023 12:53:35 +0300 Subject: net/sch_generic: Shut up noise We can't allow spam in CI. Update 26th June 2018: This is still an issue: Update 23rd May 2019: You guessed it, still occurring. 
[ 224.739686] ------------[ cut here ]------------ [ 224.739712] WARNING: CPU: 3 PID: 2982 at net/sched/sch_generic.c:461 dev_watchdog+0x1fd/0x210 [ 224.739714] Modules linked in: vgem snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_pcm i915 asix usbnet mii mei_me mei prime_numbers i2c_hid pinctrl_sunrisepoint pinctrl_intel btusb btrtl btbcm btintel bluetooth ecdh_generic [ 224.739775] CPU: 3 PID: 2982 Comm: gem_exec_suspen Tainted: G U W 4.18.0-rc2-CI-Patchwork_9414+ #1 [ 224.739777] Hardware name: Dell Inc. XPS 13 9350/, BIOS 1.4.12 11/30/2016 [ 224.739780] RIP: 0010:dev_watchdog+0x1fd/0x210 [ 224.739781] Code: 49 63 4c 24 f0 eb 92 4c 89 ef c6 05 21 46 ad 00 01 e8 77 ee fc ff 89 d9 48 89 c2 4c 89 ee 48 c7 c7 88 4c 14 82 e8 a3 fe 84 ff <0f> 0b eb be 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 c7 47 [ 224.739866] RSP: 0018:ffff88027dd83e40 EFLAGS: 00010286 [ 224.739869] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000102 [ 224.739871] RDX: 0000000080000102 RSI: ffffffff820c8c6c RDI: 00000000ffffffff [ 224.739873] RBP: ffff8802644c1540 R08: 0000000071be9b33 R09: 0000000000000000 [ 224.739874] R10: ffff88027dd83dc0 R11: 0000000000000000 R12: ffff8802644c1588 [ 224.739876] R13: ffff8802644c1160 R14: 0000000000000001 R15: ffff88026a5dc728 [ 224.739878] FS: 00007f18f4887980(0000) GS:ffff88027dd80000(0000) knlGS:0000000000000000 [ 224.739880] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 224.739881] CR2: 00007f4c627ae548 CR3: 000000022ca1a002 CR4: 00000000003606e0 [ 224.739883] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 224.739885] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 224.739886] Call Trace: [ 224.739888] [ 224.739892] ? qdisc_reset+0xe0/0xe0 [ 224.739894] ? 
qdisc_reset+0xe0/0xe0 [ 224.739897] call_timer_fn+0x93/0x360 [ 224.739903] expire_timers+0xc1/0x1d0 [ 224.739908] run_timer_softirq+0xc7/0x170 [ 224.739916] __do_softirq+0xd9/0x505 [ 224.739923] irq_exit+0xa9/0xc0 [ 224.739926] smp_apic_timer_interrupt+0x9c/0x2d0 [ 224.739929] apic_timer_interrupt+0xf/0x20 [ 224.739931] [ 224.739934] RIP: 0010:delay_tsc+0x2e/0xb0 [ 224.739936] Code: 49 89 fc 55 53 bf 01 00 00 00 e8 6d 2c 78 ff e8 88 9d b6 ff 41 89 c5 0f ae e8 0f 31 48 c1 e2 20 48 09 c2 48 89 d5 eb 16 f3 90 01 00 00 00 e8 48 2c 78 ff e8 63 9d b6 ff 44 39 e8 75 36 0f ae [ 224.740021] RSP: 0018:ffffc900002f7d48 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13 [ 224.740024] RAX: 0000000080000000 RBX: 0000000649565ca9 RCX: 0000000000000001 [ 224.740026] RDX: 0000000080000001 RSI: ffffffff820c8c6c RDI: 00000000ffffffff [ 224.740027] RBP: 00000006493ea9ce R08: 000000005e81e2ee R09: 0000000000000000 [ 224.740029] R10: 0000000000000120 R11: 0000000000000000 R12: 00000000002ad8d6 [ 224.740030] R13: 0000000000000003 R14: 0000000000000004 R15: ffff88025caf5408 [ 224.740040] ? delay_tsc+0x66/0xb0 [ 224.740045] hibernation_debug_sleep+0x1c/0x30 [ 224.740048] hibernation_snapshot+0x2c1/0x690 [ 224.740053] hibernate+0x142/0x2a4 [ 224.740057] state_store+0xd0/0xe0 [ 224.740063] kernfs_fop_write+0x104/0x190 [ 224.740068] __vfs_write+0x31/0x180 [ 224.740072] ? rcu_read_lock_sched_held+0x6f/0x80 [ 224.740075] ? rcu_sync_lockdep_assert+0x29/0x50 [ 224.740078] ? __sb_start_write+0x152/0x1f0 [ 224.740080] ? 
__sb_start_write+0x168/0x1f0 [ 224.740084] vfs_write+0xbd/0x1a0 [ 224.740088] ksys_write+0x50/0xc0 [ 224.740094] do_syscall_64+0x55/0x190 [ 224.740097] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 224.740099] RIP: 0033:0x7f18f400a281 [ 224.740100] Code: c3 0f 1f 84 00 00 00 00 00 48 8b 05 59 8d 20 00 c3 0f 1f 84 00 00 00 00 00 8b 05 8a d1 20 00 85 c0 75 16 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 57 f3 c3 0f 1f 44 00 00 41 54 55 49 89 d4 53 [ 224.740186] RSP: 002b:00007fffd1f4fec8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 224.740189] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f18f400a281 [ 224.740190] RDX: 0000000000000004 RSI: 00007f18f448069a RDI: 0000000000000006 [ 224.740192] RBP: 00007fffd1f4fef0 R08: 0000000000000000 R09: 0000000000000000 [ 224.740194] R10: 0000000000000000 R11: 0000000000000246 R12: 000055e795d03400 [ 224.740195] R13: 00007fffd1f50500 R14: 0000000000000000 R15: 0000000000000000 [ 224.740205] irq event stamp: 1582591 [ 224.740207] hardirqs last enabled at (1582590): [] vprintk_emit+0x4bc/0x4d0 [ 224.740210] hardirqs last disabled at (1582591): [] error_entry+0x7c/0x100 [ 224.740212] softirqs last enabled at (1582568): [] __do_softirq+0x34f/0x505 [ 224.740215] softirqs last disabled at (1582571): [] irq_exit+0xa9/0xc0 [ 224.740218] WARNING: CPU: 3 PID: 2982 at net/sched/sch_generic.c:461 dev_watchdog+0x1fd/0x210 [ 224.740219] ---[ end trace 6e41d690e611c338 ]--- References: https://gitlab.freedesktop.org/drm/intel/-/issues/8037 References: https://bugzilla.kernel.org/show_bug.cgi?id=196399 Acked-by: Martin Peres Cc: Martin Peres Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20170718082110.12524-1-daniel.vetter@ffwll.ch Signed-off-by: Rodrigo Vivi Signed-off-by: Joonas Lahtinen Signed-off-by: Rodrigo Vivi --- net/sched/sch_generic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 1e008a228ebd..e44e449127d2 100644 
--- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -531,7 +531,12 @@ static void dev_watchdog(struct timer_list *t) oldest_start = trans_start; } - if (unlikely(timedout_ms)) { + /* The noise is pissing off our CI and upstream doesn't + * move on the bug report: + * + * https://bugzilla.kernel.org/show_bug.cgi?id=196399 + */ + if (unlikely(timedout_ms) && 0) { trace_net_dev_xmit_timeout(dev, i); netdev_crit(dev, "NETDEV WATCHDOG: CPU: %d: transmit queue %u timed out %u ms\n", raw_smp_processor_id(), -- cgit v1.2.3 From e6578387283fd7d4102d2b55192699dac285b477 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 1 Jul 2019 15:29:03 +0100 Subject: mm: Show slab debug as offsets from section base not hashed pointers Since the kernel now uses hashed pointers for raw addresses, it is very hard to gauge the relative placement within a section, and since the hash value will never match up with any contents, using it provides no information relevant for slab debugging. Show the relative offset into each section, so that some reference for the hexdump is provided. 
References: https://gitlab.freedesktop.org/drm/intel/-/issues/8038 Signed-off-by: Chris Wilson Signed-off-by: Rodrigo Vivi --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 30003763d224..966b659260e2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -946,7 +946,7 @@ static void print_section(char *level, char *text, u8 *addr, unsigned int length) { metadata_access_enable(); - print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, + print_hex_dump(level, text, DUMP_PREFIX_OFFSET, 16, 1, kasan_reset_tag((void *)addr), length, 1); metadata_access_disable(); } -- cgit v1.2.3 From 2421f2ded0c3c0cbcfafec8c176ea8679957fdd4 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 23 Apr 2020 09:27:53 +0100 Subject: pci/msi: Stop warning for MSI enabling failure If the MSI is already enabled, trying to enable it again results in an -EINVAL and on the first attempt a WARN. That WARN causes our CI to abort the run [on each first attempt to suspend]: <4> [463.142025] WARNING: CPU: 0 PID: 2225 at drivers/pci/msi.c:1074 __pci_enable_msi_range+0x3cb/0x420 <4> [463.142026] Modules linked in: snd_hda_intel i915 snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic mei_hdcp x86_pkg_temp_thermal coretemp crct10dif_pclmul crc32_pclmul snd_intel_dspcfg ghash_clmulni_intel snd_hda_codec btusb btrtl btbcm btintel e1000e bluetooth snd_hwdep snd_hda_core ptp ecdh_generic snd_pcm ecc pps_core mei_me mei prime_numbers [last unloaded: i915] <4> [463.142045] CPU: 0 PID: 2225 Comm: kworker/u8:14 Tainted: G U 5.7.0-rc2-CI-CI_DRM_8350+ #1 <4> [463.142046] Hardware name: Intel Corporation NUC7i5BNH/NUC7i5BNB, BIOS BNKBL357.86A.0060.2017.1214.2013 12/14/2017 <4> [463.142049] Workqueue: events_unbound async_run_entry_fn <4> [463.142051] RIP: 0010:__pci_enable_msi_range+0x3cb/0x420 <4> [463.142053] Code: 76 58 49 8d 56 48 48 89 df e8 31 73 fd ff e9 20 fe ff ff 31 f6 48 89 df e8 c2 e9 fd ff e9 d6 fe ff ff 45 89 fc e9 1a ff ff ff <0f> 0b 41 bc ea ff ff 
ff e9 0d ff ff ff 41 bc ea ff ff ff e9 02 ff <4> [463.142054] RSP: 0018:ffffc90000593cd0 EFLAGS: 00010202 <4> [463.142056] RAX: 0000000000000010 RBX: ffff888274051000 RCX: 0000000000000000 <4> [463.142057] RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff888274051000 <4> [463.142058] RBP: ffff888238aa1018 R08: 0000000000000001 R09: 0000000000000001 <4> [463.142060] R10: ffffc90000593d90 R11: 00000000c79cdfd5 R12: ffff8882740510b0 <4> [463.142061] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000001 <4> [463.142062] FS: 0000000000000000(0000) GS:ffff888276c00000(0000) knlGS:0000000000000000 <4> [463.142064] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 <4> [463.142065] CR2: 000055706f347d80 CR3: 0000000005610003 CR4: 00000000003606f0 <4> [463.142066] Call Trace: <4> [463.142073] pci_enable_msi+0x11/0x20 <4> [463.142077] azx_resume+0x1ab/0x200 [snd_hda_intel] <4> [463.142080] ? pci_pm_thaw+0x80/0x80 <4> [463.142084] dpm_run_callback+0x64/0x280 <4> [463.142089] device_resume+0xd4/0x1c0 <4> [463.142093] ? dpm_watchdog_set+0x60/0 While this would appear to be a bug in snd-hda, it does appear inconsequential, at least for gfx-ci. Downgrade the warning to an info, like the other already-enabled error for MSI-X. 
References: https://gitlab.freedesktop.org/drm/intel/-/issues/8041 Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1687 Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20200423082753.3899018-1-chris@chris-wilson.co.uk Signed-off-by: Rodrigo Vivi --- drivers/pci/msi/msi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c index 34d664139f48..0030f9c89f9c 100644 --- a/drivers/pci/msi/msi.c +++ b/drivers/pci/msi/msi.c @@ -426,8 +426,10 @@ int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec, if (maxvec < minvec) return -ERANGE; - if (WARN_ON_ONCE(dev->msi_enabled)) + if (dev->msi_enabled) { + pci_info(dev, "can't enable MSI, already enabled\n"); return -EINVAL; + } /* Test for the availability of MSI support */ if (!pci_msi_domain_supports(dev, 0, ALLOW_LEGACY)) -- cgit v1.2.3 From 8dddd55fe537f30f98d51fb0f498c844ada6a7d5 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 17 Dec 2020 16:47:00 +0000 Subject: HAX net/phy: Suppress WARN for calling stop while halted References: https://gitlab.freedesktop.org/drm/intel/-/issues/8046 Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2805 Signed-off-by: Rodrigo Vivi --- drivers/net/phy/phy.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 13df28445f02..75654b5a8858 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1626,11 +1626,8 @@ void phy_stop(struct phy_device *phydev) enum phy_state old_state; if (!phy_is_started(phydev) && phydev->state != PHY_DOWN && - phydev->state != PHY_ERROR) { - WARN(1, "called from state %s\n", - phy_state_to_str(phydev->state)); + phydev->state != PHY_ERROR) return; - } mutex_lock(&phydev->lock); old_state = phydev->state; -- cgit v1.2.3 From 8e7f774a30e25c90536ac4ab965ba76138badffa Mon Sep 17 00:00:00 2001 From: Joonas Lahtinen Date: Mon, 8 May 2023 13:09:13 +0300 
Subject: HAX net/phy: Suppress WARN from phy_error References: https://gitlab.freedesktop.org/drm/intel/-/issues/8047 Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2874 Signed-off-by: Rodrigo Vivi Signed-off-by: Joonas Lahtinen Signed-off-by: Rodrigo Vivi --- drivers/net/phy/phy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 75654b5a8858..2e2fcb83141b 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1361,7 +1361,7 @@ static void phy_error_precise(struct phy_device *phydev, */ void phy_error(struct phy_device *phydev) { - WARN_ON(1); + pr_notice_once("%s\n", __func__); phy_process_error(phydev); } EXPORT_SYMBOL(phy_error); -- cgit v1.2.3 From 6b3c68c7f314d958f1ffbc9ec5fc4ad62873bb13 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 4 Jun 2024 19:16:18 +0300 Subject: thunderbolt: Add Kconfig option to disable PCIe tunneling In typical cases PCIe tunneling is needed to make the devices fully usable for the host system. However, it poses a security issue because they can also use DMA to access the host memory. We already have two ways of preventing this, one an IOMMU that is enabled on recent systems by default and the second is the "authorized" attribute under each connected device that needs to be written by userspace before a PCIe tunnel is created. This option adds one more by adding a Kconfig option, which is enabled by default, that can be used to make kernel binaries where PCIe tunneling is completely disabled. 
Signed-off-by: Mika Westerberg References: https://intel-gfx-ci.01.org/tree/drm-tip/Trybot_134314v1/bat-mtlp-9/boot0.txt References: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/11261 Signed-off-by: Imre Deak Acked-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20240604161618.1958674-1-imre.deak@intel.com Signed-off-by: Rodrigo Vivi --- drivers/thunderbolt/Kconfig | 18 ++++++++++++++++++ drivers/thunderbolt/tb.c | 2 +- drivers/thunderbolt/tb.h | 9 +++++++++ drivers/thunderbolt/tunnel.c | 8 ++++---- drivers/thunderbolt/usb4.c | 2 +- 5 files changed, 33 insertions(+), 6 deletions(-) diff --git a/drivers/thunderbolt/Kconfig b/drivers/thunderbolt/Kconfig index 0abdb69ee9f4..8bf4ecf7f76e 100644 --- a/drivers/thunderbolt/Kconfig +++ b/drivers/thunderbolt/Kconfig @@ -18,6 +18,24 @@ menuconfig USB4 if USB4 +config USB4_PCIE_TUNNELING + bool "Allow PCI Express tunneling over USB4 fabric" + depends on PCI + default y + help + USB4 and Thunderbolt devices typically include PCIe switch + with a number of PCIe endpoints such as USB host controllers, + GPUs and network adapters. These are made available to the + host system through PCIe tunneling. These can use DMA and + therefore have access to the host memory which is typically + guarded by an IOMMU. This option allows disabling PCIe + tunneling completely. + + For devices to be usable it is recommended to say Y here. + + Note this only works with systems that use Software Based + Connection Manager (this is most USB4 hosts). 
+ config USB4_DEBUGFS_WRITE bool "Enable write by debugfs to configuration spaces (DANGEROUS)" help diff --git a/drivers/thunderbolt/tb.c b/drivers/thunderbolt/tb.c index c14ab1fbeeaf..0514a673471a 100644 --- a/drivers/thunderbolt/tb.c +++ b/drivers/thunderbolt/tb.c @@ -3364,7 +3364,7 @@ struct tb *tb_probe(struct tb_nhi *nhi) if (!tb) return NULL; - if (tb_acpi_may_tunnel_pcie()) + if (tb_may_tunnel_pcie()) tb->security_level = TB_SECURITY_USER; else tb->security_level = TB_SECURITY_NOPCIE; diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h index f503bad86413..7d5c673412f4 100644 --- a/drivers/thunderbolt/tb.h +++ b/drivers/thunderbolt/tb.h @@ -1518,6 +1518,15 @@ static inline int tb_acpi_power_on_retimers(struct tb_port *port) { return 0; } static inline int tb_acpi_power_off_retimers(struct tb_port *port) { return 0; } #endif +static inline bool tb_may_tunnel_pcie(void) +{ +#ifdef CONFIG_USB4_PCIE_TUNNELING + return tb_acpi_may_tunnel_pcie(); +#else + return false; +#endif +} + #ifdef CONFIG_DEBUG_FS void tb_debugfs_init(void); void tb_debugfs_exit(void); diff --git a/drivers/thunderbolt/tunnel.c b/drivers/thunderbolt/tunnel.c index d52efe3f658c..6b85da2eea60 100644 --- a/drivers/thunderbolt/tunnel.c +++ b/drivers/thunderbolt/tunnel.c @@ -130,7 +130,7 @@ static unsigned int tb_available_credits(const struct tb_port *port, size_t ndp; usb3 = tb_acpi_may_tunnel_usb3() ? sw->max_usb3_credits : 0; - pcie = tb_acpi_may_tunnel_pcie() ? sw->max_pcie_credits : 0; + pcie = tb_may_tunnel_pcie() ? 
sw->max_pcie_credits : 0; if (tb_acpi_is_xdomain_allowed()) { spare = min_not_zero(sw->max_dma_credits, dma_credits); @@ -553,7 +553,7 @@ bool tb_tunnel_reserved_pci(struct tb_port *port, int *reserved_up, if (WARN_ON_ONCE(!port->remote)) return false; - if (!tb_acpi_may_tunnel_pcie()) + if (!tb_may_tunnel_pcie()) return false; if (tb_port_get_link_generation(port) < 4) @@ -1720,7 +1720,7 @@ static unsigned int tb_dma_available_credits(const struct tb_port *port) int credits; credits = tb_available_credits(port, NULL); - if (tb_acpi_may_tunnel_pcie()) + if (tb_may_tunnel_pcie()) credits -= sw->max_pcie_credits; credits -= port->dma_credits; @@ -2031,7 +2031,7 @@ static int tb_usb3_consumed_bandwidth(struct tb_tunnel *tunnel, int *consumed_up, int *consumed_down) { struct tb_port *port = tb_upstream_port(tunnel->dst_port->sw); - int pcie_weight = tb_acpi_may_tunnel_pcie() ? TB_PCI_WEIGHT : 0; + int pcie_weight = tb_may_tunnel_pcie() ? TB_PCI_WEIGHT : 0; /* * PCIe tunneling, if enabled, affects the USB3 bandwidth so diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c index fdae76c8f728..b3d22873e837 100644 --- a/drivers/thunderbolt/usb4.c +++ b/drivers/thunderbolt/usb4.c @@ -276,7 +276,7 @@ int usb4_switch_setup(struct tb_switch *sw) * Only enable PCIe tunneling if the parent router supports it * and it is not disabled. */ - if (tb_acpi_may_tunnel_pcie() && + if (tb_may_tunnel_pcie() && tb_switch_find_port(parent, TB_TYPE_PCIE_DOWN)) { val |= ROUTER_CS_5_PTO; /* -- cgit v1.2.3 From e9b01b51c8625afc2f18bf3a91db2b8f2c68a751 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Mon, 3 Feb 2025 15:31:13 +0200 Subject: Revert "lockdep: Enable PROVE_RAW_LOCK_NESTING with PROVE_LOCKING." This reverts commit 560af5dc839eef08a273908f390cfefefb82aa04. Locking in i915_pmu.c interacting with perf is completely wrong. It's using spinlock_t everywhere when it should actually use raw_spinlock_t since perf is already holding raw_spinlock in the caller. 
This started to be checked with commit 560af5dc839e ("lockdep: Enable PROVE_RAW_LOCK_NESTING with PROVE_LOCKING."), but should only be a real issue when PREEMPT_RT is enabled: in that config, the spinlock_t can sleep and creates issue. Reworking the locks in i915_pmu.c is not very simple as changing locks to raw_spinlock_t cascades to too many locks, which is both a) not desired from an RT perspective and b) hard to get right as it calls into other parts of the driver that have other requirements. Example backtrace: <4> [141.043897] ============================= <4> [141.043922] [ BUG: Invalid wait context ] <4> [141.043940] 6.13.0-rc2-CI_DRM_15820-g78bd7a249aa0+ #1 Not tainted <4> [141.043964] ----------------------------- <4> [141.043981] swapper/0/0 is trying to lock: <4> [141.044000] ffff88810861b910 (&pmu->lock){....}-{3:3}, at: i915_pmu_enable+0x48/0x3a0 [i915] <4> [141.044194] other info that might help us debug this: <4> [141.044217] context-{5:5} <4> [141.044229] 1 lock held by swapper/0/0: <4> [141.044248] #0: ffff88885f432038 (&cpuctx_lock){....}-{2:2}, at: __perf_install_in_context+0x3f/0x360 <4> [141.044297] stack backtrace: <4> [141.044312] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.13.0-rc2-CI_DRM_15820-g78bd7a249aa0+ #1 <4> [141.044353] Hardware name: Intel Corporation Meteor Lake Client Platform/MTL-P LP5x T3 RVP, BIOS MTLPFWI1.R00.3471.D91.2401310918 01/31/2024 <4> [141.044405] Call Trace: <4> [141.044419] <4> [141.044431] dump_stack_lvl+0x91/0xf0 <4> [141.044454] dump_stack+0x10/0x20 <4> [141.044472] __lock_acquire+0x990/0x2820 <4> [141.044498] lock_acquire+0xc9/0x300 <4> [141.044518] ? i915_pmu_enable+0x48/0x3a0 [i915] <4> [141.044689] _raw_spin_lock_irqsave+0x49/0x80 <4> [141.044713] ? i915_pmu_enable+0x48/0x3a0 [i915] <4> [141.044903] i915_pmu_enable+0x48/0x3a0 [i915] <4> [141.045112] ? 
__lock_acquire+0x455/0x2820 <4> [141.045142] i915_pmu_event_add+0x71/0x90 [i915] More time is needed to get this fixed properly, but let's not pile regressions on top. Signed-off-by: Luca Coelho Link: https://patchwork.freedesktop.org/patch/msgid/20241211121703.2890150-1-luciano.coelho@intel.com [ Reword commit message, giving more detail on what the issue is ] Signed-off-by: Lucas De Marchi References: https://gitlab.freedesktop.org/drm/i915/kernel/-/issues/13311 Acked-by: Rodrigo Vivi --- lib/Kconfig.debug | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index dc0e0c6ed075..9c189668e6f6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1420,12 +1420,20 @@ config PROVE_LOCKING config PROVE_RAW_LOCK_NESTING bool "Enable raw_spinlock - spinlock nesting checks" if !ARCH_SUPPORTS_RT depends on PROVE_LOCKING - default y if ARCH_SUPPORTS_RT + default n help Enable the raw_spinlock vs. spinlock nesting checks which ensure that the lock nesting rules for PREEMPT_RT enabled kernels are not violated. + NOTE: There are known nesting problems. So if you enable this + option expect lockdep splats until these problems have been fully + addressed which is work in progress. This config switch allows to + identify and analyze these problems. It will be removed and the + check permanently enabled once the main issues have been fixed. + + If unsure, select N. + config LOCK_STAT bool "Lock usage statistics" depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT -- cgit v1.2.3