From c32d7cab57e3a77af8ecc17cde7a5761a26483b8 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Wed, 24 Aug 2022 12:12:21 -0700 Subject: [PATCH 01/13] x86/fpu: Configure init_fpstate attributes orderly The init_fpstate setup code is spread out and out of order. The init image is recorded before its scoped features and the buffer size are determined. Determine the scope of init_fpstate components and its size before recording the init state. Also move the relevant code together. Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Acked-by: neelnatu@google.com Link: https://lore.kernel.org/r/20220824191223.1248-2-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/init.c | 8 -------- arch/x86/kernel/fpu/xstate.c | 6 +++++- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 621f4b6cac4a..8946f89761cc 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -210,13 +210,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpstate_reset(¤t->thread.fpu); } -static void __init fpu__init_init_fpstate(void) -{ - /* Bring init_fpstate size and features up to date */ - init_fpstate.size = fpu_kernel_cfg.max_size; - init_fpstate.xfeatures = fpu_kernel_cfg.max_features; -} - /* * Called on the boot CPU once per system bootup, to set up the initial * FPU state that is later cloned into all processes: @@ -236,5 +229,4 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(fpu_kernel_cfg.max_size); fpu__init_task_struct_size(); - fpu__init_init_fpstate(); } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index c8340156bfd2..f0ce10620ab0 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -360,7 +360,7 @@ static void __init setup_init_fpu_buf(void) print_xstate_features(); - xstate_init_xcomp_bv(&init_fpstate.regs.xsave, fpu_kernel_cfg.max_features); + xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); /* * Init all the features state with header.xfeatures being 0x0 @@ -875,6 +875,10 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); + /* Bring init_fpstate size and features up to date */ + init_fpstate.size = fpu_kernel_cfg.max_size; + init_fpstate.xfeatures = fpu_kernel_cfg.max_features; + setup_init_fpu_buf(); /* From d3e021adac7c51a26d9ede167c789fcc1b878467 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Wed, 24 Aug 2022 12:12:22 -0700 Subject: [PATCH 02/13] x86/fpu: Fix the init_fpstate size check with the actual size The init_fpstate buffer is statically allocated. Thus, the sanity test was established to check whether the pre-allocated buffer is enough for the calculated size or not. The currently measured size is not strictly relevant. Fix to validate the calculated init_fpstate size with the pre-allocated area. Also, replace the sanity check function with open code for clarity. The abstraction itself and the function naming do not tend to represent simply what it does. Fixes: 2ae996e0c1a3 ("x86/fpu: Calculate the default sizes independently") Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220824191223.1248-3-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f0ce10620ab0..f5ef78633b4c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -678,20 +678,6 @@ static unsigned int __init get_xsave_size_user(void) return ebx; } -/* - * Will the runtime-enumerated 'xstate_size' fit in the init - * task's statically-allocated buffer? - */ -static bool __init is_supported_xstate_size(unsigned int test_xstate_size) -{ - if (test_xstate_size <= sizeof(init_fpstate.regs)) - return true; - - pr_warn("x86/fpu: xstate buffer too small (%zu < %d), disabling xsave\n", - sizeof(init_fpstate.regs), test_xstate_size); - return false; -} - static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ @@ -717,10 +703,6 @@ static int __init init_xstate_size(void) kernel_default_size = xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); - /* Ensure we have the space to store all default enabled features. */ - if (!is_supported_xstate_size(kernel_default_size)) - return -EINVAL; - if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; @@ -879,6 +861,12 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) init_fpstate.size = fpu_kernel_cfg.max_size; init_fpstate.xfeatures = fpu_kernel_cfg.max_features; + if (init_fpstate.size > sizeof(init_fpstate.regs)) { + pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", + sizeof(init_fpstate.regs), init_fpstate.size); + goto out_disable; + } + setup_init_fpu_buf(); /* From a401f45e38754953c9d402f8b3bc965707eecc91 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Wed, 24 Aug 2022 12:12:23 -0700 Subject: [PATCH 03/13] x86/fpu: Exclude dynamic states from init_fpstate == Background == The XSTATE init code initializes all enabled and supported components. Then, the init states are saved in the init_fpstate buffer that is statically allocated in about one page. The AMX TILE_DATA state is large (8KB) but its init state is zero. And the feature comes only with the compacted format with these established dependencies: AMX->XFD->XSAVES. So this state is excludable from init_fpstate. == Problem == But the buffer is formatted to include that large state. Then, this can be the cause of a noisy splat like the below. This came from XRSTORS for the task with init_fpstate in its XSAVE buffer. It is reproducible on AMX systems when the running kernel is built with CONFIG_DEBUG_PAGEALLOC=y and CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT=y: Bad FPU state detected at restore_fpregs_from_fpstate+0x57/0xd0, reinitializing FPU registers. ... RIP: 0010:restore_fpregs_from_fpstate+0x57/0xd0 ? restore_fpregs_from_fpstate+0x45/0xd0 switch_fpu_return+0x4e/0xe0 exit_to_user_mode_prepare+0x17b/0x1b0 syscall_exit_to_user_mode+0x29/0x40 do_syscall_64+0x67/0x80 ? do_syscall_64+0x67/0x80 ? exc_page_fault+0x86/0x180 entry_SYSCALL_64_after_hwframe+0x63/0xcd == Solution == Adjust init_fpstate to exclude dynamic states. XRSTORS from init_fpstate still initializes those states when their bits are set in the requested-feature bitmap. Fixes: 2308ee57d93d ("x86/fpu/amx: Enable the AMX feature in 64-bit mode") Reported-by: Lin X Wang Signed-off-by: Chang S. Bae Signed-off-by: Thomas Gleixner Tested-by: Lin X Wang Link: https://lore.kernel.org/r/20220824191223.1248-4-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index f5ef78633b4c..e77cabfa802f 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -857,9 +857,12 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); - /* Bring init_fpstate size and features up to date */ - init_fpstate.size = fpu_kernel_cfg.max_size; - init_fpstate.xfeatures = fpu_kernel_cfg.max_features; + /* + * init_fpstate excludes dynamic states as they are large but init + * state is zero. + */ + init_fpstate.size = fpu_kernel_cfg.default_size; + init_fpstate.xfeatures = fpu_kernel_cfg.default_features; if (init_fpstate.size > sizeof(init_fpstate.regs)) { pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", From 33806e7cb8d50379f55c3e8f335e91e1b359dc7b Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 29 Sep 2022 08:20:10 -0700 Subject: [PATCH 04/13] x86/Kconfig: Drop check for -mabi=ms for CONFIG_EFI_STUB A recent change in LLVM made CONFIG_EFI_STUB unselectable because it no longer pretends to support -mabi=ms, breaking the dependency in Kconfig. Lack of CONFIG_EFI_STUB can prevent kernels from booting via EFI in certain circumstances. This check was added by 8f24f8c2fc82 ("efi/libstub: Annotate firmware routines as __efiapi") to ensure that __attribute__((ms_abi)) was available, as -mabi=ms is not actually used in any cflags. According to the GCC documentation, this attribute has been supported since GCC 4.4.7. The kernel currently requires GCC 5.1 so this check is not necessary; even when that change landed in 5.6, the kernel required GCC 4.9 so it was unnecessary then as well. Clang supports __attribute__((ms_abi)) for all versions that are supported for building the kernel so no additional check is needed. Remove the 'depends on' line altogether to allow CONFIG_EFI_STUB to be selected when CONFIG_EFI is enabled, regardless of compiler. Fixes: 8f24f8c2fc82 ("efi/libstub: Annotate firmware routines as __efiapi") Signed-off-by: Nathan Chancellor Signed-off-by: Borislav Petkov Reviewed-by: Nick Desaulniers Acked-by: Ard Biesheuvel Cc: stable@vger.kernel.org Link: https://github.com/llvm/llvm-project/commit/d1ad006a8f64bdc17f618deffa9e7c91d82c444d --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6d1879ef933a..67745ceab0db 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1973,7 +1973,6 @@ config EFI config EFI_STUB bool "EFI stub support" depends on EFI - depends on $(cc-option,-mabi=ms) || X86_32 select RELOCATABLE help This kernel feature allows a bzImage to be loaded directly From 7108b80a542b9d65e44b36d64a700a83658c0b73 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 14 Oct 2022 17:01:45 +0800 Subject: [PATCH 05/13] hwmon/coretemp: Handle large core ID value The coretemp driver supports up to a hard-coded limit of 128 cores. Today, the driver can not support a core with an ID above that limit. Yet, the encoding of core ID's is arbitrary (BIOS APIC-ID) and so they may be sparse and they may be large. Update the driver to map arbitrary core ID numbers into appropriate array indexes so that 128 cores can be supported, no matter the encoding of core ID's. Signed-off-by: Zhang Rui Signed-off-by: Dave Hansen Acked-by: Len Brown Acked-by: Guenter Roeck Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20221014090147.1836-3-rui.zhang@intel.com --- drivers/hwmon/coretemp.c | 56 +++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index ccf0af5b988a..8bf32c6c85d9 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -46,9 +46,6 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius"); #define TOTAL_ATTRS (MAX_CORE_ATTRS + 1) #define MAX_CORE_DATA (NUM_REAL_CORES + BASE_SYSFS_ATTR_NO) -#define TO_CORE_ID(cpu) (cpu_data(cpu).cpu_core_id) -#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) - #ifdef CONFIG_SMP #define for_each_sibling(i, cpu) \ for_each_cpu(i, topology_sibling_cpumask(cpu)) @@ -91,6 +88,8 @@ struct temp_data { struct platform_data { struct device *hwmon_dev; u16 pkg_id; + u16 cpu_map[NUM_REAL_CORES]; + struct ida ida; struct cpumask cpumask; struct temp_data *core_data[MAX_CORE_DATA]; struct device_attribute name_attr; @@ -441,7 +440,7 @@ static struct temp_data *init_temp_data(unsigned int cpu, int pkg_flag) MSR_IA32_THERM_STATUS; tdata->is_pkg_data = pkg_flag; tdata->cpu = cpu; - tdata->cpu_core_id = TO_CORE_ID(cpu); + tdata->cpu_core_id = topology_core_id(cpu); tdata->attr_size = MAX_CORE_ATTRS; mutex_init(&tdata->update_lock); return tdata; @@ -454,7 +453,7 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, struct platform_data *pdata = platform_get_drvdata(pdev); struct cpuinfo_x86 *c = &cpu_data(cpu); u32 eax, edx; - int err, attr_no; + int err, index, attr_no; /* * Find attr number for sysfs: @@ -462,14 +461,26 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, * The attr number is always core id + 2 * The Pkgtemp will always show up as temp1_*, if available */ - attr_no = pkg_flag ? PKG_SYSFS_ATTR_NO : TO_ATTR_NO(cpu); + if (pkg_flag) { + attr_no = PKG_SYSFS_ATTR_NO; + } else { + index = ida_alloc(&pdata->ida, GFP_KERNEL); + if (index < 0) + return index; + pdata->cpu_map[index] = topology_core_id(cpu); + attr_no = index + BASE_SYSFS_ATTR_NO; + } - if (attr_no > MAX_CORE_DATA - 1) - return -ERANGE; + if (attr_no > MAX_CORE_DATA - 1) { + err = -ERANGE; + goto ida_free; + } tdata = init_temp_data(cpu, pkg_flag); - if (!tdata) - return -ENOMEM; + if (!tdata) { + err = -ENOMEM; + goto ida_free; + } /* Test if we can access the status register */ err = rdmsr_safe_on_cpu(cpu, tdata->status_reg, &eax, &edx); @@ -505,6 +516,9 @@ static int create_core_data(struct platform_device *pdev, unsigned int cpu, exit_free: pdata->core_data[attr_no] = NULL; kfree(tdata); +ida_free: + if (!pkg_flag) + ida_free(&pdata->ida, index); return err; } @@ -524,6 +538,9 @@ static void coretemp_remove_core(struct platform_data *pdata, int indx) kfree(pdata->core_data[indx]); pdata->core_data[indx] = NULL; + + if (indx >= BASE_SYSFS_ATTR_NO) + ida_free(&pdata->ida, indx - BASE_SYSFS_ATTR_NO); } static int coretemp_probe(struct platform_device *pdev) @@ -537,6 +554,7 @@ static int coretemp_probe(struct platform_device *pdev) return -ENOMEM; pdata->pkg_id = pdev->id; + ida_init(&pdata->ida); platform_set_drvdata(pdev, pdata); pdata->hwmon_dev = devm_hwmon_device_register_with_groups(dev, DRVNAME, @@ -553,6 +571,7 @@ static int coretemp_remove(struct platform_device *pdev) if (pdata->core_data[i]) coretemp_remove_core(pdata, i); + ida_destroy(&pdata->ida); return 0; } @@ -647,7 +666,7 @@ static int coretemp_cpu_offline(unsigned int cpu) struct platform_device *pdev = coretemp_get_pdev(cpu); struct platform_data *pd; struct temp_data *tdata; - int indx, target; + int i, indx = -1, target; /* * Don't execute this on suspend as the device remove locks @@ -660,12 +679,19 @@ static int coretemp_cpu_offline(unsigned int cpu) if (!pdev) return 0; - /* The core id is too big, just return */ - indx = TO_ATTR_NO(cpu); - if (indx > MAX_CORE_DATA - 1) + pd = platform_get_drvdata(pdev); + + for (i = 0; i < NUM_REAL_CORES; i++) { + if (pd->cpu_map[i] == topology_core_id(cpu)) { + indx = i + BASE_SYSFS_ATTR_NO; + break; + } + } + + /* Too many cores and this core is not populated, just return */ + if (indx < 0) return 0; - pd = platform_get_drvdata(pdev); tdata = pd->core_data[indx]; cpumask_clear_cpu(cpu, &pd->cpumask); From 2b12a7a126d62bdbd81f4923c21bf6e9a7fbd069 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 14 Oct 2022 17:01:46 +0800 Subject: [PATCH 06/13] x86/topology: Fix multiple packages shown on a single-package system CPUID.1F/B does not enumerate Package level explicitly, instead, all the APIC-ID bits above the enumerated levels are assumed to be package ID bits. Current code gets package ID by shifting out all the APIC-ID bits that Linux supports, rather than shifting out all the APIC-ID bits that CPUID.1F enumerates. This introduces problems when CPUID.1F enumerates a level that Linux does not support. For example, on a single package AlderLake-N, there are 2 Ecore Modules with 4 atom cores in each module. Linux does not support the Module level and interprets the Module ID bits as package ID and erroneously reports a multi module system as a multi-package system. Fix this by using APIC-ID bits above all the CPUID.1F enumerated levels as package ID. [ dhansen: spelling fix ] Fixes: 7745f03eb395 ("x86/topology: Add CPUID.1F multi-die/package support") Suggested-by: Len Brown Signed-off-by: Zhang Rui Signed-off-by: Dave Hansen Reviewed-by: Len Brown Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20221014090147.1836-4-rui.zhang@intel.com --- arch/x86/kernel/cpu/topology.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 132a2de44d2f..f7592814e5d5 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -96,6 +96,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; unsigned int core_select_mask, core_level_siblings; unsigned int die_select_mask, die_level_siblings; + unsigned int pkg_mask_width; bool die_level_present = false; int leaf; @@ -111,10 +112,10 @@ int detect_extended_topology(struct cpuinfo_x86 *c) core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); - die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + pkg_mask_width = die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; - do { + while (true) { cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); /* @@ -132,8 +133,13 @@ int detect_extended_topology(struct cpuinfo_x86 *c) die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } + if (LEAFB_SUBTYPE(ecx) != INVALID_TYPE) + pkg_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + else + break; + sub_index++; - } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + } core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; die_select_mask = (~(-1 << die_plus_mask_width)) >> @@ -148,7 +154,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) } c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, - die_plus_mask_width); + pkg_mask_width); /* * Reinit the apicid, now that we have extended initial_apicid. */ From 71eac7063698b7d7b8fafb1683ac24a034541141 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 14 Oct 2022 17:01:47 +0800 Subject: [PATCH 07/13] x86/topology: Fix duplicated core ID within a package Today, core ID is assumed to be unique within each package. But an AlderLake-N platform adds a Module level between core and package, Linux excludes the unknown modules bits from the core ID, resulting in duplicate core ID's. To keep core ID unique within a package, Linux must include all APIC-ID bits for known or unknown levels above the core and below the package in the core ID. It is important to understand that core ID's have always come directly from the APIC-ID encoding, which comes from the BIOS. Thus there is no guarantee that they start at 0, or that they are contiguous. As such, naively using them for array indexes can be problematic. [ dhansen: un-known -> unknown ] Fixes: 7745f03eb395 ("x86/topology: Add CPUID.1F multi-die/package support") Suggested-by: Len Brown Signed-off-by: Zhang Rui Signed-off-by: Dave Hansen Reviewed-by: Len Brown Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20221014090147.1836-5-rui.zhang@intel.com --- arch/x86/kernel/cpu/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index f7592814e5d5..5e868b62a7c4 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -141,7 +141,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c) sub_index++; } - core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + core_select_mask = (~(-1 << pkg_mask_width)) >> ht_mask_width; die_select_mask = (~(-1 << die_plus_mask_width)) >> core_plus_mask_width; From e7ad18d1169c62e6c78c01ff693fd362d9d65278 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 5 Oct 2022 12:00:08 +0200 Subject: [PATCH 08/13] x86/microcode/AMD: Apply the patch early on every logical thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the patch application logic checks whether the revision needs to be applied on each logical CPU (SMT thread). Therefore, on SMT designs where the microcode engine is shared between the two threads, the application happens only on one of them as that is enough to update the shared microcode engine. However, there are microcode patches which do per-thread modification, see Link tag below. Therefore, drop the revision check and try applying on each thread. This is what the BIOS does too so this method is very much tested. Btw, change only the early paths. On the late loading paths, there's no point in doing per-thread modification because if is it some case like in the bugzilla below - removing a CPUID flag - the kernel cannot go and un-use features it has detected are there early. For that, one should use early loading anyway. [ bp: Fixes does not contain the oldest commit which did check for equality but that is good enough. ] Fixes: 8801b3fcb574 ("x86/microcode/AMD: Rework container parsing") Reported-by: Ștefan Talpalaru Signed-off-by: Borislav Petkov Tested-by: Ștefan Talpalaru Cc: Link: https://bugzilla.kernel.org/show_bug.cgi?id=216211 --- arch/x86/kernel/cpu/microcode/amd.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index e7410e98fc1f..3a35dec3ec55 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -440,7 +440,13 @@ apply_microcode_early_amd(u32 cpuid_1_eax, void *ucode, size_t size, bool save_p return ret; native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev >= mc->hdr.patch_id) + + /* + * Allow application of the same revision to pick up SMT-specific + * changes even if the revision of the other SMT thread is already + * up-to-date. + */ + if (rev > mc->hdr.patch_id) return ret; if (!__apply_microcode_amd(mc)) { @@ -528,8 +534,12 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax) native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - /* Check whether we have saved a new patch already: */ - if (*new_rev && rev < mc->hdr.patch_id) { + /* + * Check whether a new patch has been saved already. Also, allow application of + * the same revision in order to pick up SMT-thread-specific configuration even + * if the sibling SMT thread already has an up-to-date revision. + */ + if (*new_rev && rev <= mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { *new_rev = mc->hdr.patch_id; return; From 67bf6493449b09590f9f71d7df29efb392b12d25 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Tue, 27 Sep 2022 15:16:29 -0500 Subject: [PATCH 09/13] x86/resctrl: Fix min_cbm_bits for AMD AMD systems support zero CBM (capacity bit mask) for cache allocation. That is reflected in rdt_init_res_defs_amd() by: r->cache.arch_has_empty_bitmaps = true; However given the unified code in cbm_validate(), checking for: val == 0 && !arch_has_empty_bitmaps is not enough because of another check in cbm_validate(): if ((zero_bit - first_bit) < r->cache.min_cbm_bits) The default value of r->cache.min_cbm_bits = 1. Leading to: $ cd /sys/fs/resctrl $ mkdir foo $ cd foo $ echo L3:0=0 > schemata -bash: echo: write error: Invalid argument $ cat /sys/fs/resctrl/info/last_cmd_status Need at least 1 bits in the mask Initialize the min_cbm_bits to 0 for AMD. Also, remove the default setting of min_cbm_bits and initialize it separately. After the fix: $ cd /sys/fs/resctrl $ mkdir foo $ cd foo $ echo L3:0=0 > schemata $ cat /sys/fs/resctrl/info/last_cmd_status ok Fixes: 316e7f901f5a ("x86/resctrl: Add struct rdt_cache::arch_has_{sparse, empty}_bitmaps") Co-developed-by: Stephane Eranian Signed-off-by: Stephane Eranian Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov Reviewed-by: Ingo Molnar Reviewed-by: James Morse Reviewed-by: Reinette Chatre Reviewed-by: Fenghua Yu Cc: Link: https://lore.kernel.org/lkml/20220517001234.3137157-1-eranian@google.com --- arch/x86/kernel/cpu/resctrl/core.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index de62b0b87ced..3266ea36667c 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -66,9 +66,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .rid = RDT_RESOURCE_L3, .name = "L3", .cache_level = 3, - .cache = { - .min_cbm_bits = 1, - }, .domains = domain_init(RDT_RESOURCE_L3), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", @@ -83,9 +80,6 @@ struct rdt_hw_resource rdt_resources_all[] = { .rid = RDT_RESOURCE_L2, .name = "L2", .cache_level = 2, - .cache = { - .min_cbm_bits = 1, - }, .domains = domain_init(RDT_RESOURCE_L2), .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", @@ -836,6 +830,7 @@ static __init void rdt_init_res_defs_intel(void) r->cache.arch_has_sparse_bitmaps = false; r->cache.arch_has_empty_bitmaps = false; r->cache.arch_has_per_cpu_cfg = false; + r->cache.min_cbm_bits = 1; } else if (r->rid == RDT_RESOURCE_MBA) { hw_res->msr_base = MSR_IA32_MBA_THRTL_BASE; hw_res->msr_update = mba_wrmsr_intel; @@ -856,6 +851,7 @@ static __init void rdt_init_res_defs_amd(void) r->cache.arch_has_sparse_bitmaps = true; r->cache.arch_has_empty_bitmaps = true; r->cache.arch_has_per_cpu_cfg = true; + r->cache.min_cbm_bits = 0; } else if (r->rid == RDT_RESOURCE_MBA) { hw_res->msr_base = MSR_IA32_MBA_BW_BASE; hw_res->msr_update = mba_wrmsr_amd; From b5f1fc3184405ab955db1b86d41d8b744d07c12d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Sep 2022 13:11:35 +0200 Subject: [PATCH 10/13] x86/ftrace: Remove ftrace_epilogue() Remove the weird jumps to RET and simply use RET. This then promotes ftrace_stub() to a real function; which becomes important for kcfi. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111148.719080593@infradead.org Signed-off-by: Peter Zijlstra --- arch/x86/kernel/ftrace_64.S | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index dfeb227de561..a90c55a6b481 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -172,20 +172,14 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) */ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR - - jmp ftrace_epilogue + RET SYM_FUNC_END(ftrace_caller); STACK_FRAME_NON_STANDARD_FP(ftrace_caller) -SYM_FUNC_START(ftrace_epilogue) -/* - * This is weak to keep gas from relaxing the jumps. - */ -SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) +SYM_FUNC_START(ftrace_stub) UNWIND_HINT_FUNC - ENDBR RET -SYM_FUNC_END(ftrace_epilogue) +SYM_FUNC_END(ftrace_stub) SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ @@ -262,14 +256,11 @@ SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) popfq /* - * As this jmp to ftrace_epilogue can be a short jump - * it must not be copied into the trampoline. - * The trampoline will add the code to jump - * to the return. + * The trampoline will add the return. */ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) ANNOTATE_NOENDBR - jmp ftrace_epilogue + RET /* Swap the flags with orig_rax */ 1: movq MCOUNT_REG_SIZE(%rsp), %rdi @@ -280,7 +271,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) /* Restore flags */ popfq UNWIND_HINT_FUNC - jmp ftrace_epilogue + RET SYM_FUNC_END(ftrace_regs_caller) STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) From 883bbbffa5a4ffd1915f8b42934dab81b7f87226 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Oct 2022 13:49:21 +0200 Subject: [PATCH 11/13] ftrace,kcfi: Separate ftrace_stub() and ftrace_stub_graph() Different function signatures means they needs to be different functions; otherwise CFI gets upset. As triggered by the ftrace boot tests: [] CFI failure at ftrace_return_to_handler+0xac/0x16c (target: ftrace_stub+0x0/0x14; expected type: 0x0a5d5347) Fixes: 3c516f89e17e ("x86: Add support for CONFIG_CFI_CLANG") Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mark Rutland Tested-by: Mark Rutland Link: https://lkml.kernel.org/r/Y06dg4e1xF6JTdQq@hirez.programming.kicks-ass.net --- arch/arm64/kernel/entry-ftrace.S | 7 ++++++- arch/x86/kernel/ftrace_64.S | 17 +++++++++-------- include/asm-generic/vmlinux.lds.h | 18 ++++++++++++------ 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index bd5df50e4643..795344ab4ec4 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -7,6 +7,7 @@ */ #include +#include #include #include #include @@ -294,10 +295,14 @@ SYM_FUNC_END(ftrace_graph_caller) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */ -SYM_FUNC_START(ftrace_stub) +SYM_TYPED_FUNC_START(ftrace_stub) ret SYM_FUNC_END(ftrace_stub) +SYM_TYPED_FUNC_START(ftrace_stub_graph) + ret +SYM_FUNC_END(ftrace_stub_graph) + #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* * void return_to_handler(void) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index a90c55a6b481..2a4be92fd144 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -129,6 +130,14 @@ .endm +SYM_TYPED_FUNC_START(ftrace_stub) + RET +SYM_FUNC_END(ftrace_stub) + +SYM_TYPED_FUNC_START(ftrace_stub_graph) + RET +SYM_FUNC_END(ftrace_stub_graph) + #ifdef CONFIG_DYNAMIC_FTRACE SYM_FUNC_START(__fentry__) @@ -176,11 +185,6 @@ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) SYM_FUNC_END(ftrace_caller); STACK_FRAME_NON_STANDARD_FP(ftrace_caller) -SYM_FUNC_START(ftrace_stub) - UNWIND_HINT_FUNC - RET -SYM_FUNC_END(ftrace_stub) - SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq @@ -282,9 +286,6 @@ STACK_FRAME_NON_STANDARD_FP(ftrace_regs_caller) SYM_FUNC_START(__fentry__) cmpq $ftrace_stub, ftrace_trace_function jnz trace - -SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) - ENDBR RET trace: diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c15de165ec8f..d06ada2341cb 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -162,6 +162,16 @@ #define PATCHABLE_DISCARDS *(__patchable_function_entries) #endif +#ifndef CONFIG_ARCH_SUPPORTS_CFI_CLANG +/* + * Simply points to ftrace_stub, but with the proper protocol. + * Defined by the linker script in linux/vmlinux.lds.h + */ +#define FTRACE_STUB_HACK ftrace_stub_graph = ftrace_stub; +#else +#define FTRACE_STUB_HACK +#endif + #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* * The ftrace call sites are logged to a section whose name depends on the @@ -169,10 +179,6 @@ * FTRACE_CALLSITE_SECTION. We capture all of them here to avoid header * dependencies for FTRACE_CALLSITE_SECTION's definition. * - * Need to also make ftrace_stub_graph point to ftrace_stub - * so that the same stub location may have different protocols - * and not mess up with C verifiers. - * * ftrace_ops_list_func will be defined as arch_ftrace_ops_list_func * as some archs will have a different prototype for that function * but ftrace_ops_list_func() will have a single prototype. @@ -182,11 +188,11 @@ KEEP(*(__mcount_loc)) \ KEEP_PATCHABLE \ __stop_mcount_loc = .; \ - ftrace_stub_graph = ftrace_stub; \ + FTRACE_STUB_HACK \ ftrace_ops_list_func = arch_ftrace_ops_list_func; #else # ifdef CONFIG_FUNCTION_TRACER -# define MCOUNT_REC() ftrace_stub_graph = ftrace_stub; \ +# define MCOUNT_REC() FTRACE_STUB_HACK \ ftrace_ops_list_func = arch_ftrace_ops_list_func; # else # define MCOUNT_REC() From b329f5ddc9ce4b622d9c7aaf5c6df4de52caf91a Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 18 Jul 2022 17:11:19 +0300 Subject: [PATCH 12/13] perf/x86/intel/lbr: Use setup_clear_cpu_cap() instead of clear_cpu_cap() clear_cpu_cap(&boot_cpu_data) is very similar to setup_clear_cpu_cap() except that the latter also sets a bit in 'cpu_caps_cleared' which later clears the same cap in secondary cpus, which is likely what is meant here. Fixes: 47125db27e47 ("perf/x86/intel/lbr: Support Architectural LBR") Signed-off-by: Maxim Levitsky Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lkml.kernel.org/r/20220718141123.136106-2-mlevitsk@redhat.com --- arch/x86/events/intel/lbr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4fce1a4226e3..8259d725054d 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -1596,7 +1596,7 @@ void __init intel_pmu_arch_lbr_init(void) return; clear_arch_lbr: - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR); + setup_clear_cpu_cap(X86_FEATURE_ARCH_LBR); } /** From 471f0aa7fa64e23766a1473b32d9ec3f0718895a Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Fri, 21 Oct 2022 11:58:44 -0700 Subject: [PATCH 13/13] x86/fpu: Fix copy_xstate_to_uabi() to copy init states correctly When an extended state component is not present in fpstate, but in init state, the function copies from init_fpstate via copy_feature(). But, dynamic states are not present in init_fpstate because of all-zeros init states. Then retrieving them from init_fpstate will explode like this: BUG: kernel NULL pointer dereference, address: 0000000000000000 ... RIP: 0010:memcpy_erms+0x6/0x10 ? __copy_xstate_to_uabi_buf+0x381/0x870 fpu_copy_guest_fpstate_to_uabi+0x28/0x80 kvm_arch_vcpu_ioctl+0x14c/0x1460 [kvm] ? __this_cpu_preempt_check+0x13/0x20 ? vmx_vcpu_put+0x2e/0x260 [kvm_intel] kvm_vcpu_ioctl+0xea/0x6b0 [kvm] ? kvm_vcpu_ioctl+0xea/0x6b0 [kvm] ? __fget_light+0xd4/0x130 __x64_sys_ioctl+0xe3/0x910 ? debug_smp_processor_id+0x17/0x20 ? fpregs_assert_state_consistent+0x27/0x50 do_syscall_64+0x3f/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Adjust the 'mask' to zero out the userspace buffer for the features that are not available both from fpstate and from init_fpstate. The dynamic features depend on the compacted XSAVE format. Ensure it is enabled before reading XCOMP_BV in init_fpstate. Fixes: 2308ee57d93d ("x86/fpu/amx: Enable the AMX feature in 64-bit mode") Reported-by: Yuan Yao Suggested-by: Dave Hansen Signed-off-by: Chang S. Bae Signed-off-by: Dave Hansen Tested-by: Yuan Yao Link: https://lore.kernel.org/lkml/BYAPR11MB3717EDEF2351C958F2C86EED95259@BYAPR11MB3717.namprd11.prod.outlook.com/ Link: https://lkml.kernel.org/r/20221021185844.13472-1-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e77cabfa802f..59e543b95a3c 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1125,6 +1125,15 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, */ mask = fpstate->user_xfeatures; + /* + * Dynamic features are not present in init_fpstate. When they are + * in an all zeros init state, remove those from 'mask' to zero + * those features in the user buffer instead of retrieving them + * from init_fpstate. + */ + if (fpu_state_size_dynamic()) + mask &= (header.xfeatures | xinit->header.xcomp_bv); + for_each_extended_xfeature(i, mask) { /* * If there was a feature or alignment gap, zero the space