aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild4
-rw-r--r--arch/x86/Kconfig42
-rw-r--r--arch/x86/Makefile17
-rw-r--r--arch/x86/boot/code16gcc.h24
-rw-r--r--arch/x86/boot/compressed/Makefile3
-rw-r--r--arch/x86/boot/compressed/eboot.c48
-rw-r--r--arch/x86/boot/compressed/eboot.h16
-rw-r--r--arch/x86/boot/header.S28
-rw-r--r--arch/x86/boot/tools/build.c38
-rw-r--r--arch/x86/crypto/Makefile4
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S546
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c40
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S281
-rw-r--r--arch/x86/crypto/des3_ede-asm_64.S805
-rw-r--r--arch/x86/crypto/des3_ede_glue.c509
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c2
-rw-r--r--arch/x86/include/asm/Kbuild3
-rw-r--r--arch/x86/include/asm/acenv.h4
-rw-r--r--arch/x86/include/asm/acpi.h5
-rw-r--r--arch/x86/include/asm/alternative.h14
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/barrier.h2
-rw-r--r--arch/x86/include/asm/cmpxchg.h4
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h2
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h409
-rw-r--r--arch/x86/include/asm/crash.h9
-rw-r--r--arch/x86/include/asm/efi.h33
-rw-r--r--arch/x86/include/asm/fpu-internal.h11
-rw-r--r--arch/x86/include/asm/ftrace.h2
-rw-r--r--arch/x86/include/asm/irq.h2
-rw-r--r--arch/x86/include/asm/irqflags.h2
-rw-r--r--arch/x86/include/asm/kexec-bzimage64.h6
-rw-r--r--arch/x86/include/asm/kexec.h45
-rw-r--r--arch/x86/include/asm/kvm_emulate.h33
-rw-r--r--arch/x86/include/asm/kvm_host.h19
-rw-r--r--arch/x86/include/asm/mc146818rtc.h2
-rw-r--r--arch/x86/include/asm/mmu_context.h6
-rw-r--r--arch/x86/include/asm/mutex_32.h16
-rw-r--r--arch/x86/include/asm/mwait.h2
-rw-r--r--arch/x86/include/asm/page.h1
-rw-r--r--arch/x86/include/asm/page_64.h2
-rw-r--r--arch/x86/include/asm/percpu.h3
-rw-r--r--arch/x86/include/asm/platform_sst_audio.h78
-rw-r--r--arch/x86/include/asm/pmc_atom.h107
-rw-r--r--arch/x86/include/asm/processor.h7
-rw-r--r--arch/x86/include/asm/ptrace.h16
-rw-r--r--arch/x86/include/asm/qrwlock.h2
-rw-r--r--arch/x86/include/asm/scatterlist.h8
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h19
-rw-r--r--arch/x86/include/asm/vdso.h18
-rw-r--r--arch/x86/include/asm/vga.h6
-rw-r--r--arch/x86/include/asm/vmx.h7
-rw-r--r--arch/x86/include/asm/xsave.h223
-rw-r--r--arch/x86/include/uapi/asm/Kbuild1
-rw-r--r--arch/x86/include/uapi/asm/kvm.h3
-rw-r--r--arch/x86/include/uapi/asm/kvm_perf.h16
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h3
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/Makefile1
-rw-r--r--arch/x86/kernel/acpi/apei.c62
-rw-r--r--arch/x86/kernel/acpi/boot.c4
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c18
-rw-r--r--arch/x86/kernel/apm_32.c1
-rw-r--r--arch/x86/kernel/cpu/amd.c348
-rw-r--r--arch/x86/kernel/cpu/common.c30
-rw-r--r--arch/x86/kernel/cpu/intel.c52
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c16
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c25
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c18
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh51
-rw-r--r--arch/x86/kernel/cpu/perf_event.c3
-rw-r--r--arch/x86/kernel/cpu/perf_event.h12
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_uncore.c111
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c78
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c16
-rw-r--r--arch/x86/kernel/cpu/proc.c8
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/crash.c563
-rw-r--r--arch/x86/kernel/entry_32.S22
-rw-r--r--arch/x86/kernel/entry_64.S80
-rw-r--r--arch/x86/kernel/espfix_64.c5
-rw-r--r--arch/x86/kernel/ftrace.c3
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c553
-rw-r--r--arch/x86/kernel/kprobes/core.c3
-rw-r--r--arch/x86/kernel/machine_kexec_64.c239
-rw-r--r--arch/x86/kernel/mcount_64.S13
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c2
-rw-r--r--arch/x86/kernel/pmc_atom.c321
-rw-r--r--arch/x86/kernel/process.c1
-rw-r--r--arch/x86/kernel/reboot.c24
-rw-r--r--arch/x86/kernel/resource.c8
-rw-r--r--arch/x86/kernel/setup.c4
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/tsc.c32
-rw-r--r--arch/x86/kernel/vsyscall_64.c8
-rw-r--r--arch/x86/kernel/vsyscall_gtod.c23
-rw-r--r--arch/x86/kernel/xsave.c118
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c494
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/lapic.c56
-rw-r--r--arch/x86/kvm/mmu_audit.c2
-rw-r--r--arch/x86/kvm/mmutrace.h4
-rw-r--r--arch/x86/kvm/pmu.c9
-rw-r--r--arch/x86/kvm/svm.c58
-rw-r--r--arch/x86/kvm/trace.h6
-rw-r--r--arch/x86/kvm/vmx.c241
-rw-r--r--arch/x86/kvm/x86.c249
-rw-r--r--arch/x86/kvm/x86.h27
-rw-r--r--arch/x86/mm/fault.c9
-rw-r--r--arch/x86/mm/init.c7
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/init_64.c3
-rw-r--r--arch/x86/mm/tlb.c103
-rw-r--r--arch/x86/net/bpf_jit_comp.c16
-rw-r--r--arch/x86/pci/fixup.c21
-rw-r--r--arch/x86/pci/i386.c4
-rw-r--r--arch/x86/platform/efi/Makefile2
-rw-r--r--arch/x86/platform/efi/efi.c483
-rw-r--r--arch/x86/platform/efi/quirks.c290
-rw-r--r--arch/x86/platform/ts5500/ts5500.c94
-rw-r--r--arch/x86/platform/uv/tlb_uv.c71
-rw-r--r--arch/x86/power/cpu.c4
-rw-r--r--arch/x86/purgatory/Makefile30
-rw-r--r--arch/x86/purgatory/entry64.S101
-rw-r--r--arch/x86/purgatory/purgatory.c72
-rw-r--r--arch/x86/purgatory/setup-x86_64.S58
-rw-r--r--arch/x86/purgatory/sha256.c283
-rw-r--r--arch/x86/purgatory/sha256.h22
-rw-r--r--arch/x86/purgatory/stack.S19
-rw-r--r--arch/x86/purgatory/string.c13
-rw-r--r--arch/x86/syscalls/syscall_32.tbl3
-rw-r--r--arch/x86/syscalls/syscall_64.tbl4
-rw-r--r--arch/x86/um/asm/elf.h1
-rw-r--r--arch/x86/um/asm/processor.h3
-rw-r--r--arch/x86/um/mem_64.c15
-rw-r--r--arch/x86/um/signal.c45
-rw-r--r--arch/x86/vdso/Makefile34
-rw-r--r--arch/x86/vdso/vclock_gettime.c3
-rw-r--r--arch/x86/vdso/vdso-fakesections.c32
-rw-r--r--arch/x86/vdso/vdso-layout.lds.S102
-rw-r--r--arch/x86/vdso/vdso.lds.S2
-rw-r--r--arch/x86/vdso/vdso2c.c195
-rw-r--r--arch/x86/vdso/vdso2c.h105
-rw-r--r--arch/x86/vdso/vdso32-setup.c19
-rw-r--r--arch/x86/vdso/vdso32/vdso-fakesections.c1
-rw-r--r--arch/x86/vdso/vdsox32.lds.S2
-rw-r--r--arch/x86/vdso/vma.c26
-rw-r--r--arch/x86/xen/Makefile1
-rw-r--r--arch/x86/xen/efi.c43
-rw-r--r--arch/x86/xen/enlighten.c15
-rw-r--r--arch/x86/xen/grant-table.c106
-rw-r--r--arch/x86/xen/p2m.c5
-rw-r--r--arch/x86/xen/xen-ops.h8
159 files changed, 7714 insertions, 2376 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index e5287d8517a..61b6d51866f 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -16,3 +16,7 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/
obj-y += platform/
obj-y += net/
+
+ifeq ($(CONFIG_X86_64),y)
+obj-$(CONFIG_KEXEC) += purgatory/
+endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 147a7b78810..5d0bf1aa9dc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,6 +21,7 @@ config X86_64
### Arch settings
config X86
def_bool y
+ select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
@@ -54,7 +55,6 @@ config X86
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_GRAPH_FP_TEST
- select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_SYSCALL_TRACEPOINTS
select SYSCTL_EXCEPTION_TRACE
select HAVE_KVM
@@ -96,6 +96,7 @@ config X86
select IRQ_FORCED_THREADING
select HAVE_BPF_JIT if X86_64
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select ARCH_HAS_SG_CHAIN
select CLKEVT_I8253
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_IOMAP
@@ -109,9 +110,9 @@ config X86
select CLOCKSOURCE_WATCHDOG
select GENERIC_CLOCKEVENTS
select ARCH_CLOCKSOURCE_DATA
+ select CLOCKSOURCE_VALIDATE_LAST_CYCLE
select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
select GENERIC_TIME_VSYSCALL
- select KTIME_SCALAR if X86_32
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select HAVE_CONTEXT_TRACKING if X86_64
@@ -131,6 +132,10 @@ config X86
select HAVE_CC_STACKPROTECTOR
select GENERIC_CPU_AUTOPROBE
select HAVE_ARCH_AUDITSYSCALL
+ select ARCH_SUPPORTS_ATOMIC_RMW
+ select HAVE_ACPI_APEI if ACPI
+ select HAVE_ACPI_APEI_NMI if ACPI
+ select ACPI_LEGACY_TABLES_LOOKUP if ACPI
config INSTRUCTION_DECODER
def_bool y
@@ -1524,6 +1529,7 @@ config EFI
bool "EFI runtime service support"
depends on ACPI
select UCS2_STRING
+ select EFI_RUNTIME_WRAPPERS
---help---
This enables the kernel to use EFI runtime services that are
available (such as the EFI variable services).
@@ -1537,7 +1543,8 @@ config EFI
config EFI_STUB
bool "EFI stub support"
- depends on EFI
+ depends on EFI && !X86_USE_3DNOW
+ select RELOCATABLE
---help---
This kernel feature allows a bzImage to be loaded directly
by EFI firmware without the use of a bootloader.
@@ -1578,6 +1585,9 @@ source kernel/Kconfig.hz
config KEXEC
bool "kexec system call"
+ select BUILD_BIN2C
+ select CRYPTO
+ select CRYPTO_SHA256
---help---
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
@@ -1592,6 +1602,28 @@ config KEXEC
interface is strongly in flux, so no good recommendation can be
made.
+config KEXEC_VERIFY_SIG
+ bool "Verify kernel signature during kexec_file_load() syscall"
+ depends on KEXEC
+ ---help---
+ This option makes kernel signature verification mandatory for
+ kexec_file_load() syscall. If kernel is signature can not be
+ verified, kexec_file_load() will fail.
+
+ This option enforces signature verification at generic level.
+ One needs to enable signature verification for type of kernel
+ image being loaded to make sure it works. For example, enable
+ bzImage signature verification option to be able to load and
+ verify signatures of bzImage. Otherwise kernel loading will fail.
+
+config KEXEC_BZIMAGE_VERIFY_SIG
+ bool "Enable bzImage signature verification support"
+ depends on KEXEC_VERIFY_SIG
+ depends on SIGNED_PE_FILE_VERIFICATION
+ select SYSTEM_TRUSTED_KEYRING
+ ---help---
+ Enable bzImage signature verification support.
+
config CRASH_DUMP
bool "kernel crash dumps"
depends on X86_64 || (X86_32 && HIGHMEM)
@@ -2405,6 +2437,10 @@ config IOSF_MBI
default m
depends on PCI
+config PMC_ATOM
+ def_bool y
+ depends on PCI
+
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 33f71b01fd2..c1aa3688784 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -15,12 +15,9 @@ endif
# that way we can complain to the user if the CPU is insufficient.
#
# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For
-# older versions of GCC, we need to play evil and unreliable tricks to
-# attempt to ensure that our asm(".code16gcc") is first in the asm
-# output.
-CODE16GCC_CFLAGS := -m32 -include $(srctree)/arch/x86/boot/code16gcc.h \
- $(call cc-option, -fno-toplevel-reorder,\
- $(call cc-option, -fno-unit-at-a-time))
+# older versions of GCC, include an *assembly* header to make sure that
+# gcc doesn't play any games behind our back.
+CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h
M16_CFLAGS := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))
REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \
@@ -186,6 +183,14 @@ archscripts: scripts_basic
archheaders:
$(Q)$(MAKE) $(build)=arch/x86/syscalls all
+archprepare:
+ifeq ($(CONFIG_KEXEC),y)
+# Build only for 64bit. No loaders for 32bit yet.
+ ifeq ($(CONFIG_X86_64),y)
+ $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c
+ endif
+endif
+
###
# Kernel objects
diff --git a/arch/x86/boot/code16gcc.h b/arch/x86/boot/code16gcc.h
index d93e48010b6..5ff42653539 100644
--- a/arch/x86/boot/code16gcc.h
+++ b/arch/x86/boot/code16gcc.h
@@ -1,15 +1,11 @@
-/*
- * code16gcc.h
- *
- * This file is -include'd when compiling 16-bit C code.
- * Note: this asm() needs to be emitted before gcc emits any code.
- * Depending on gcc version, this requires -fno-unit-at-a-time or
- * -fno-toplevel-reorder.
- *
- * Hopefully gcc will eventually have a real -m16 option so we can
- * drop this hack long term.
- */
+#
+# code16gcc.h
+#
+# This file is added to the assembler via -Wa when compiling 16-bit C code.
+# This is done this way instead via asm() to make sure gcc does not reorder
+# things around us.
+#
+# gcc 4.9+ has a real -m16 option so we can drop this hack long term.
+#
-#ifndef __ASSEMBLY__
-asm(".code16gcc");
-#endif
+ .code16gcc
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0fcd9133790..7a801a310e3 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -33,7 +33,8 @@ VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
ifeq ($(CONFIG_EFI_STUB), y)
- VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
+ VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
+ $(objtree)/drivers/firmware/efi/libstub/lib.a
endif
$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 0331d765c2b..f277184e2ac 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -19,10 +19,7 @@
static efi_system_table_t *sys_table;
-static struct efi_config *efi_early;
-
-#define efi_call_early(f, ...) \
- efi_early->call(efi_early->f, __VA_ARGS__);
+struct efi_config *efi_early;
#define BOOT_SERVICES(bits) \
static void setup_boot_services##bits(struct efi_config *c) \
@@ -48,8 +45,7 @@ static void setup_boot_services##bits(struct efi_config *c) \
BOOT_SERVICES(32);
BOOT_SERVICES(64);
-static void efi_printk(efi_system_table_t *, char *);
-static void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
+void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
static efi_status_t
__file_size32(void *__fh, efi_char16_t *filename_16,
@@ -156,7 +152,7 @@ grow:
return status;
}
-static efi_status_t
+efi_status_t
efi_file_size(efi_system_table_t *sys_table, void *__fh,
efi_char16_t *filename_16, void **handle, u64 *file_sz)
{
@@ -166,7 +162,7 @@ efi_file_size(efi_system_table_t *sys_table, void *__fh,
return __file_size32(__fh, filename_16, handle, file_sz);
}
-static inline efi_status_t
+efi_status_t
efi_file_read(void *handle, unsigned long *size, void *addr)
{
unsigned long func;
@@ -184,7 +180,7 @@ efi_file_read(void *handle, unsigned long *size, void *addr)
}
}
-static inline efi_status_t efi_file_close(void *handle)
+efi_status_t efi_file_close(void *handle)
{
if (efi_early->is64) {
efi_file_handle_64_t *fh = handle;
@@ -249,7 +245,7 @@ static inline efi_status_t __open_volume64(void *__image, void **__fh)
return status;
}
-static inline efi_status_t
+efi_status_t
efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh)
{
if (efi_early->is64)
@@ -258,7 +254,7 @@ efi_open_volume(efi_system_table_t *sys_table, void *__image, void **__fh)
return __open_volume32(__image, __fh);
}
-static void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
+void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
{
unsigned long output_string;
size_t offset;
@@ -284,8 +280,6 @@ static void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
}
}
-#include "../../../../drivers/firmware/efi/efi-stub-helper.c"
-
static void find_bits(unsigned long mask, u8 *pos, u8 *size)
{
u8 first, len;
@@ -1038,6 +1032,7 @@ struct boot_params *make_boot_params(struct efi_config *c)
int i;
unsigned long ramdisk_addr;
unsigned long ramdisk_size;
+ unsigned long initrd_addr_max;
efi_early = c;
sys_table = (efi_system_table_t *)(unsigned long)efi_early->table;
@@ -1100,14 +1095,21 @@ struct boot_params *make_boot_params(struct efi_config *c)
memset(sdt, 0, sizeof(*sdt));
+ if (hdr->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)
+ initrd_addr_max = -1UL;
+ else
+ initrd_addr_max = hdr->initrd_addr_max;
+
status = handle_cmdline_files(sys_table, image,
(char *)(unsigned long)hdr->cmd_line_ptr,
- "initrd=", hdr->initrd_addr_max,
+ "initrd=", initrd_addr_max,
&ramdisk_addr, &ramdisk_size);
if (status != EFI_SUCCESS)
goto fail2;
- hdr->ramdisk_image = ramdisk_addr;
- hdr->ramdisk_size = ramdisk_size;
+ hdr->ramdisk_image = ramdisk_addr & 0xffffffff;
+ hdr->ramdisk_size = ramdisk_size & 0xffffffff;
+ boot_params->ext_ramdisk_image = (u64)ramdisk_addr >> 32;
+ boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32;
return boot_params;
fail2:
@@ -1374,7 +1376,10 @@ struct boot_params *efi_main(struct efi_config *c,
setup_graphics(boot_params);
- setup_efi_pci(boot_params);
+ status = setup_efi_pci(boot_params);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "setup_efi_pci() failed!\n");
+ }
status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
sizeof(*gdt), (void **)&gdt);
@@ -1401,16 +1406,20 @@ struct boot_params *efi_main(struct efi_config *c,
hdr->init_size, hdr->init_size,
hdr->pref_address,
hdr->kernel_alignment);
- if (status != EFI_SUCCESS)
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "efi_relocate_kernel() failed!\n");
goto fail;
+ }
hdr->pref_address = hdr->code32_start;
hdr->code32_start = bzimage_addr;
}
status = exit_boot(boot_params, handle, is64);
- if (status != EFI_SUCCESS)
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "exit_boot() failed!\n");
goto fail;
+ }
memset((char *)gdt->address, 0x0, gdt->size);
desc = (struct desc_struct *)gdt->address;
@@ -1470,5 +1479,6 @@ struct boot_params *efi_main(struct efi_config *c,
return boot_params;
fail:
+ efi_printk(sys_table, "efi_main() failed!\n");
return NULL;
}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index c88c31ecad1..d487e727f1e 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -103,20 +103,4 @@ struct efi_uga_draw_protocol {
void *blt;
};
-struct efi_config {
- u64 image_handle;
- u64 table;
- u64 allocate_pool;
- u64 allocate_pages;
- u64 get_memory_map;
- u64 free_pool;
- u64 free_pages;
- u64 locate_handle;
- u64 handle_protocol;
- u64 exit_boot_services;
- u64 text_output;
- efi_status_t (*call)(unsigned long, ...);
- bool is64;
-} __packed;
-
#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 84c223479e3..16ef02596db 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -91,10 +91,9 @@ bs_die:
.section ".bsdata", "a"
bugger_off_msg:
- .ascii "Direct floppy boot is not supported. "
- .ascii "Use a boot loader program instead.\r\n"
+ .ascii "Use a boot loader.\r\n"
.ascii "\n"
- .ascii "Remove disk and press any key to reboot ...\r\n"
+ .ascii "Remove disk and press any key to reboot...\r\n"
.byte 0
#ifdef CONFIG_EFI_STUB
@@ -108,7 +107,7 @@ coff_header:
#else
.word 0x8664 # x86-64
#endif
- .word 3 # nr_sections
+ .word 4 # nr_sections
.long 0 # TimeDateStamp
.long 0 # PointerToSymbolTable
.long 1 # NumberOfSymbols
@@ -155,7 +154,7 @@ extra_header_fields:
#else
.quad 0 # ImageBase
#endif
- .long 0x20 # SectionAlignment
+ .long CONFIG_PHYSICAL_ALIGN # SectionAlignment
.long 0x20 # FileAlignment
.word 0 # MajorOperatingSystemVersion
.word 0 # MinorOperatingSystemVersion
@@ -250,6 +249,25 @@ section_table:
.word 0 # NumberOfLineNumbers
.long 0x60500020 # Characteristics (section flags)
+ #
+ # The offset & size fields are filled in by build.c.
+ #
+ .ascii ".bss"
+ .byte 0
+ .byte 0
+ .byte 0
+ .byte 0
+ .long 0
+ .long 0x0
+ .long 0 # Size of initialized data
+ # on disk
+ .long 0x0
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long 0xc8000080 # Characteristics (section flags)
+
#endif /* CONFIG_EFI_STUB */
# Kernel attributes; used by setup. This is part 1 of the
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index 1a2f2121cad..a7661c430cd 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -143,7 +143,7 @@ static void usage(void)
#ifdef CONFIG_EFI_STUB
-static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset)
{
unsigned int pe_header;
unsigned short num_sections;
@@ -164,10 +164,10 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
put_unaligned_le32(size, section + 0x8);
/* section header vma field */
- put_unaligned_le32(offset, section + 0xc);
+ put_unaligned_le32(vma, section + 0xc);
/* section header 'size of initialised data' field */
- put_unaligned_le32(size, section + 0x10);
+ put_unaligned_le32(datasz, section + 0x10);
/* section header 'file offset' field */
put_unaligned_le32(offset, section + 0x14);
@@ -179,6 +179,11 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz
}
}
+static void update_pecoff_section_header(char *section_name, u32 offset, u32 size)
+{
+ update_pecoff_section_header_fields(section_name, offset, size, size, offset);
+}
+
static void update_pecoff_setup_and_reloc(unsigned int size)
{
u32 setup_offset = 0x200;
@@ -203,9 +208,6 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
pe_header = get_unaligned_le32(&buf[0x3c]);
- /* Size of image */
- put_unaligned_le32(file_sz, &buf[pe_header + 0x50]);
-
/*
* Size of code: Subtract the size of the first sector (512 bytes)
* which includes the header.
@@ -220,6 +222,22 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz)
update_pecoff_section_header(".text", text_start, text_sz);
}
+static void update_pecoff_bss(unsigned int file_sz, unsigned int init_sz)
+{
+ unsigned int pe_header;
+ unsigned int bss_sz = init_sz - file_sz;
+
+ pe_header = get_unaligned_le32(&buf[0x3c]);
+
+ /* Size of uninitialized data */
+ put_unaligned_le32(bss_sz, &buf[pe_header + 0x24]);
+
+ /* Size of image */
+ put_unaligned_le32(init_sz, &buf[pe_header + 0x50]);
+
+ update_pecoff_section_header_fields(".bss", file_sz, bss_sz, 0, 0);
+}
+
static int reserve_pecoff_reloc_section(int c)
{
/* Reserve 0x20 bytes for .reloc section */
@@ -259,6 +277,8 @@ static void efi_stub_entry_update(void)
static inline void update_pecoff_setup_and_reloc(unsigned int size) {}
static inline void update_pecoff_text(unsigned int text_start,
unsigned int file_sz) {}
+static inline void update_pecoff_bss(unsigned int file_sz,
+ unsigned int init_sz) {}
static inline void efi_stub_defaults(void) {}
static inline void efi_stub_entry_update(void) {}
@@ -310,7 +330,7 @@ static void parse_zoffset(char *fname)
int main(int argc, char ** argv)
{
- unsigned int i, sz, setup_sectors;
+ unsigned int i, sz, setup_sectors, init_sz;
int c;
u32 sys_size;
struct stat sb;
@@ -376,7 +396,9 @@ int main(int argc, char ** argv)
buf[0x1f1] = setup_sectors-1;
put_unaligned_le32(sys_size, &buf[0x1f4]);
- update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz));
+ update_pecoff_text(setup_sectors * 512, i + (sys_size * 16));
+ init_sz = get_unaligned_le32(&buf[0x260]);
+ update_pecoff_bss(i + (sys_size * 16), init_sz);
efi_stub_entry_update();
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 61d6e281898..d551165a315 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_DES3_EDE_X86_64) += des3_ede-x86_64.o
obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
@@ -52,6 +53,7 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
+des3_ede-x86_64-y := des3_ede-asm_64.o des3_ede_glue.o
camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
@@ -76,7 +78,7 @@ ifeq ($(avx2_supported),yes)
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
ifeq ($(avx2_supported),yes)
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
new file mode 100644
index 00000000000..f091f122ed2
--- /dev/null
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -0,0 +1,546 @@
+/*
+ * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
+ *
+ * This is AES128/192/256 CTR mode optimization implementation. It requires
+ * the support of Intel(R) AESNI and AVX instructions.
+ *
+ * This work was inspired by the AES CTR mode optimization published
+ * in Intel Optimized IPSEC Cryptograhpic library.
+ * Additional information on it can be found at:
+ * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Sean Gulley <sean.m.gulley@intel.com>
+ * Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/inst.h>
+
+#define CONCAT(a,b) a##b
+#define VMOVDQ vmovdqu
+
+#define xdata0 %xmm0
+#define xdata1 %xmm1
+#define xdata2 %xmm2
+#define xdata3 %xmm3
+#define xdata4 %xmm4
+#define xdata5 %xmm5
+#define xdata6 %xmm6
+#define xdata7 %xmm7
+#define xcounter %xmm8
+#define xbyteswap %xmm9
+#define xkey0 %xmm10
+#define xkey3 %xmm11
+#define xkey6 %xmm12
+#define xkey9 %xmm13
+#define xkey4 %xmm11
+#define xkey8 %xmm12
+#define xkey12 %xmm13
+#define xkeyA %xmm14
+#define xkeyB %xmm15
+
+#define p_in %rdi
+#define p_iv %rsi
+#define p_keys %rdx
+#define p_out %rcx
+#define num_bytes %r8
+
+#define tmp %r10
+#define DDQ(i) CONCAT(ddq_add_,i)
+#define XMM(i) CONCAT(%xmm, i)
+#define DDQ_DATA 0
+#define XDATA 1
+#define KEY_128 1
+#define KEY_192 2
+#define KEY_256 3
+
+.section .rodata
+.align 16
+
+byteswap_const:
+ .octa 0x000102030405060708090A0B0C0D0E0F
+ddq_add_1:
+ .octa 0x00000000000000000000000000000001
+ddq_add_2:
+ .octa 0x00000000000000000000000000000002
+ddq_add_3:
+ .octa 0x00000000000000000000000000000003
+ddq_add_4:
+ .octa 0x00000000000000000000000000000004
+ddq_add_5:
+ .octa 0x00000000000000000000000000000005
+ddq_add_6:
+ .octa 0x00000000000000000000000000000006
+ddq_add_7:
+ .octa 0x00000000000000000000000000000007
+ddq_add_8:
+ .octa 0x00000000000000000000000000000008
+
+.text
+
+/* generate a unique variable for ddq_add_x */
+
+.macro setddq n
+ var_ddq_add = DDQ(\n)
+.endm
+
+/* generate a unique variable for xmm register */
+.macro setxdata n
+ var_xdata = XMM(\n)
+.endm
+
+/* club the numeric 'id' to the symbol 'name' */
+
+.macro club name, id
+.altmacro
+ .if \name == DDQ_DATA
+ setddq %\id
+ .elseif \name == XDATA
+ setxdata %\id
+ .endif
+.noaltmacro
+.endm
+
+/*
+ * do_aes num_in_par load_keys key_len
+ * This increments p_in, but not p_out
+ */
+.macro do_aes b, k, key_len
+ .set by, \b
+ .set load_keys, \k
+ .set klen, \key_len
+
+ .if (load_keys)
+ vmovdqa 0*16(p_keys), xkey0
+ .endif
+
+ vpshufb xbyteswap, xcounter, xdata0
+
+ .set i, 1
+ .rept (by - 1)
+ club DDQ_DATA, i
+ club XDATA, i
+ vpaddd var_ddq_add(%rip), xcounter, var_xdata
+ vpshufb xbyteswap, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 1*16(p_keys), xkeyA
+
+ vpxor xkey0, xdata0, xdata0
+ club DDQ_DATA, by
+ vpaddd var_ddq_add(%rip), xcounter, xcounter
+
+ .set i, 1
+ .rept (by - 1)
+ club XDATA, i
+ vpxor xkey0, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 2*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 3*16(p_keys), xkeyA
+ .endif
+ .else
+ vmovdqa 3*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
+ .set i, (i +1)
+ .endr
+
+ add $(16*by), p_in
+
+ .if (klen == KEY_128)
+ vmovdqa 4*16(p_keys), xkey4
+ .else
+ .if (load_keys)
+ vmovdqa 4*16(p_keys), xkey4
+ .endif
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 3 */
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 5*16(p_keys), xkeyA
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkey4, var_xdata, var_xdata /* key 4 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 6*16(p_keys), xkeyB
+ .endif
+ .else
+ vmovdqa 6*16(p_keys), xkeyB
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 7*16(p_keys), xkeyA
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyB, var_xdata, var_xdata /* key 6 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ vmovdqa 8*16(p_keys), xkey8
+ .else
+ .if (load_keys)
+ vmovdqa 8*16(p_keys), xkey8
+ .endif
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_128)
+ .if (load_keys)
+ vmovdqa 9*16(p_keys), xkeyA
+ .endif
+ .else
+ vmovdqa 9*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkey8, var_xdata, var_xdata /* key 8 */
+ .set i, (i +1)
+ .endr
+
+ vmovdqa 10*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 9 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen != KEY_128)
+ vmovdqa 11*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 10 */
+ .if (klen == KEY_128)
+ vaesenclast xkeyB, var_xdata, var_xdata
+ .else
+ vaesenc xkeyB, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen != KEY_128)
+ .if (load_keys)
+ vmovdqa 12*16(p_keys), xkey12
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_256)
+ vmovdqa 13*16(p_keys), xkeyA
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ .if (klen == KEY_256)
+ /* key 12 */
+ vaesenc xkey12, var_xdata, var_xdata
+ .else
+ vaesenclast xkey12, var_xdata, var_xdata
+ .endif
+ .set i, (i +1)
+ .endr
+
+ .if (klen == KEY_256)
+ vmovdqa 14*16(p_keys), xkeyB
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 13 */
+ vaesenc xkeyA, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ /* key 14 */
+ vaesenclast xkeyB, var_xdata, var_xdata
+ .set i, (i +1)
+ .endr
+ .endif
+ .endif
+
+ .set i, 0
+ .rept (by / 2)
+ .set j, (i+1)
+ VMOVDQ (i*16 - 16*by)(p_in), xkeyA
+ VMOVDQ (j*16 - 16*by)(p_in), xkeyB
+ club XDATA, i
+ vpxor xkeyA, var_xdata, var_xdata
+ club XDATA, j
+ vpxor xkeyB, var_xdata, var_xdata
+ .set i, (i+2)
+ .endr
+
+ .if (i < by)
+ VMOVDQ (i*16 - 16*by)(p_in), xkeyA
+ club XDATA, i
+ vpxor xkeyA, var_xdata, var_xdata
+ .endif
+
+ .set i, 0
+ .rept by
+ club XDATA, i
+ VMOVDQ var_xdata, i*16(p_out)
+ .set i, (i+1)
+ .endr
+.endm
+
+.macro do_aes_load val, key_len
+ do_aes \val, 1, \key_len
+.endm
+
+.macro do_aes_noload val, key_len
+ do_aes \val, 0, \key_len
+.endm
+
+/* main body of aes ctr load */
+
+.macro do_aes_ctrmain key_len
+
+ cmp $16, num_bytes
+ jb .Ldo_return2\key_len
+
+ vmovdqa byteswap_const(%rip), xbyteswap
+ vmovdqu (p_iv), xcounter
+ vpshufb xbyteswap, xcounter, xcounter
+
+ mov num_bytes, tmp
+ and $(7*16), tmp
+ jz .Lmult_of_8_blks\key_len
+
+ /* 1 <= tmp <= 7 */
+ cmp $(4*16), tmp
+ jg .Lgt4\key_len
+ je .Leq4\key_len
+
+.Llt4\key_len:
+ cmp $(2*16), tmp
+ jg .Leq3\key_len
+ je .Leq2\key_len
+
+.Leq1\key_len:
+ do_aes_load 1, \key_len
+ add $(1*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq2\key_len:
+ do_aes_load 2, \key_len
+ add $(2*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+
+.Leq3\key_len:
+ do_aes_load 3, \key_len
+ add $(3*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq4\key_len:
+ do_aes_load 4, \key_len
+ add $(4*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Lgt4\key_len:
+ cmp $(6*16), tmp
+ jg .Leq7\key_len
+ je .Leq6\key_len
+
+.Leq5\key_len:
+ do_aes_load 5, \key_len
+ add $(5*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq6\key_len:
+ do_aes_load 6, \key_len
+ add $(6*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Leq7\key_len:
+ do_aes_load 7, \key_len
+ add $(7*16), p_out
+ and $(~7*16), num_bytes
+ jz .Ldo_return2\key_len
+ jmp .Lmain_loop2\key_len
+
+.Lmult_of_8_blks\key_len:
+ .if (\key_len != KEY_128)
+ vmovdqa 0*16(p_keys), xkey0
+ vmovdqa 4*16(p_keys), xkey4
+ vmovdqa 8*16(p_keys), xkey8
+ vmovdqa 12*16(p_keys), xkey12
+ .else
+ vmovdqa 0*16(p_keys), xkey0
+ vmovdqa 3*16(p_keys), xkey4
+ vmovdqa 6*16(p_keys), xkey8
+ vmovdqa 9*16(p_keys), xkey12
+ .endif
+.align 16
+.Lmain_loop2\key_len:
+ /* num_bytes is a multiple of 8 and >0 */
+ do_aes_noload 8, \key_len
+ add $(8*16), p_out
+ sub $(8*16), num_bytes
+ jne .Lmain_loop2\key_len
+
+.Ldo_return2\key_len:
+ /* return updated IV */
+ vpshufb xbyteswap, xcounter, xcounter
+ vmovdqu xcounter, (p_iv)
+ ret
+.endm
+
+/*
+ * routine to do AES128 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_128_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_128
+
+ENDPROC(aes_ctr_enc_128_avx_by8)
+
+/*
+ * routine to do AES192 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_192_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_192
+
+ENDPROC(aes_ctr_enc_192_avx_by8)
+
+/*
+ * routine to do AES256 CTR enc/decrypt "by8"
+ * XMM registers are clobbered.
+ * Saving/restoring must be done at a higher level
+ * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
+ * unsigned int num_bytes)
+ */
+ENTRY(aes_ctr_enc_256_avx_by8)
+ /* call the aes main loop */
+ do_aes_ctrmain KEY_256
+
+ENDPROC(aes_ctr_enc_256_avx_by8)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 948ad0e7774..888950f29fd 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -105,6 +105,9 @@ void crypto_fpu_exit(void);
#define AVX_GEN4_OPTSIZE 4096
#ifdef CONFIG_X86_64
+
+static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
@@ -155,6 +158,12 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
#ifdef CONFIG_AS_AVX
+asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
+asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
+ void *keys, u8 *out, unsigned int num_bytes);
/*
* asmlinkage void aesni_gcm_precomp_avx_gen2()
* gcm_data *my_ctx_data, context data
@@ -472,6 +481,25 @@ static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
crypto_inc(ctrblk, AES_BLOCK_SIZE);
}
+#ifdef CONFIG_AS_AVX
+static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, unsigned int len, u8 *iv)
+{
+ /*
+ * based on key length, override with the by8 version
+ * of ctr mode encryption/decryption for improved performance
+ * aes_set_key_common() ensures that key length is one of
+ * {128,192,256}
+ */
+ if (ctx->key_length == AES_KEYSIZE_128)
+ aes_ctr_enc_128_avx_by8(in, iv, (void *)ctx, out, len);
+ else if (ctx->key_length == AES_KEYSIZE_192)
+ aes_ctr_enc_192_avx_by8(in, iv, (void *)ctx, out, len);
+ else
+ aes_ctr_enc_256_avx_by8(in, iv, (void *)ctx, out, len);
+}
+#endif
+
static int ctr_crypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
@@ -486,8 +514,8 @@ static int ctr_crypt(struct blkcipher_desc *desc,
kernel_fpu_begin();
while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
- aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK, walk.iv);
+ aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+ nbytes & AES_BLOCK_MASK, walk.iv);
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
@@ -1493,6 +1521,14 @@ static int __init aesni_init(void)
aesni_gcm_enc_tfm = aesni_gcm_enc;
aesni_gcm_dec_tfm = aesni_gcm_dec;
}
+ aesni_ctr_enc_tfm = aesni_ctr_enc;
+#ifdef CONFIG_AS_AVX
+ if (cpu_has_avx) {
+ /* optimize performance of ctr mode encryption transform */
+ aesni_ctr_enc_tfm = aesni_ctr_enc_avx_tfm;
+ pr_info("AES CTR mode by8 optimization enabled\n");
+ }
+#endif
#endif
err = crypto_fpu_init();
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index dbc4339b541..26d49ebae04 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -72,6 +72,7 @@
# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
+.text
ENTRY(crc_pcl)
#define bufp %rdi
#define bufp_dw %edi
@@ -216,15 +217,11 @@ LABEL crc_ %i
## 4) Combine three results:
################################################################
- lea (K_table-16)(%rip), bufp # first entry is for idx 1
+ lea (K_table-8)(%rip), bufp # first entry is for idx 1
shlq $3, %rax # rax *= 8
- subq %rax, tmp # tmp -= rax*8
- shlq $1, %rax
- subq %rax, tmp # tmp -= rax*16
- # (total tmp -= rax*24)
- addq %rax, bufp
-
- movdqa (bufp), %xmm0 # 2 consts: K1:K2
+ pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2
+ leal (%eax,%eax,2), %eax # rax *= 3 (total *24)
+ subq %rax, tmp # tmp -= rax*24
movq crc_init, %xmm1 # CRC for block 1
PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2
@@ -238,9 +235,9 @@ LABEL crc_ %i
mov crc2, crc_init
crc32 %rax, crc_init
-################################################################
-## 5) Check for end:
-################################################################
+ ################################################################
+ ## 5) Check for end:
+ ################################################################
LABEL crc_ 0
mov tmp, len
@@ -331,136 +328,136 @@ ENDPROC(crc_pcl)
################################################################
## PCLMULQDQ tables
- ## Table is 128 entries x 2 quad words each
+ ## Table is 128 entries x 2 words (8 bytes) each
################################################################
-.data
-.align 64
+.section .rotata, "a", %progbits
+.align 8
K_table:
- .quad 0x14cd00bd6,0x105ec76f0
- .quad 0x0ba4fc28e,0x14cd00bd6
- .quad 0x1d82c63da,0x0f20c0dfe
- .quad 0x09e4addf8,0x0ba4fc28e
- .quad 0x039d3b296,0x1384aa63a
- .quad 0x102f9b8a2,0x1d82c63da
- .quad 0x14237f5e6,0x01c291d04
- .quad 0x00d3b6092,0x09e4addf8
- .quad 0x0c96cfdc0,0x0740eef02
- .quad 0x18266e456,0x039d3b296
- .quad 0x0daece73e,0x0083a6eec
- .quad 0x0ab7aff2a,0x102f9b8a2
- .quad 0x1248ea574,0x1c1733996
- .quad 0x083348832,0x14237f5e6
- .quad 0x12c743124,0x02ad91c30
- .quad 0x0b9e02b86,0x00d3b6092
- .quad 0x018b33a4e,0x06992cea2
- .quad 0x1b331e26a,0x0c96cfdc0
- .quad 0x17d35ba46,0x07e908048
- .quad 0x1bf2e8b8a,0x18266e456
- .quad 0x1a3e0968a,0x11ed1f9d8
- .quad 0x0ce7f39f4,0x0daece73e
- .quad 0x061d82e56,0x0f1d0f55e
- .quad 0x0d270f1a2,0x0ab7aff2a
- .quad 0x1c3f5f66c,0x0a87ab8a8
- .quad 0x12ed0daac,0x1248ea574
- .quad 0x065863b64,0x08462d800
- .quad 0x11eef4f8e,0x083348832
- .quad 0x1ee54f54c,0x071d111a8
- .quad 0x0b3e32c28,0x12c743124
- .quad 0x0064f7f26,0x0ffd852c6
- .quad 0x0dd7e3b0c,0x0b9e02b86
- .quad 0x0f285651c,0x0dcb17aa4
- .quad 0x010746f3c,0x018b33a4e
- .quad 0x1c24afea4,0x0f37c5aee
- .quad 0x0271d9844,0x1b331e26a
- .quad 0x08e766a0c,0x06051d5a2
- .quad 0x093a5f730,0x17d35ba46
- .quad 0x06cb08e5c,0x11d5ca20e
- .quad 0x06b749fb2,0x1bf2e8b8a
- .quad 0x1167f94f2,0x021f3d99c
- .quad 0x0cec3662e,0x1a3e0968a
- .quad 0x19329634a,0x08f158014
- .quad 0x0e6fc4e6a,0x0ce7f39f4
- .quad 0x08227bb8a,0x1a5e82106
- .quad 0x0b0cd4768,0x061d82e56
- .quad 0x13c2b89c4,0x188815ab2
- .quad 0x0d7a4825c,0x0d270f1a2
- .quad 0x10f5ff2ba,0x105405f3e
- .quad 0x00167d312,0x1c3f5f66c
- .quad 0x0f6076544,0x0e9adf796
- .quad 0x026f6a60a,0x12ed0daac
- .quad 0x1a2adb74e,0x096638b34
- .quad 0x19d34af3a,0x065863b64
- .quad 0x049c3cc9c,0x1e50585a0
- .quad 0x068bce87a,0x11eef4f8e
- .quad 0x1524fa6c6,0x19f1c69dc
- .quad 0x16cba8aca,0x1ee54f54c
- .quad 0x042d98888,0x12913343e
- .quad 0x1329d9f7e,0x0b3e32c28
- .quad 0x1b1c69528,0x088f25a3a
- .quad 0x02178513a,0x0064f7f26
- .quad 0x0e0ac139e,0x04e36f0b0
- .quad 0x0170076fa,0x0dd7e3b0c
- .quad 0x141a1a2e2,0x0bd6f81f8
- .quad 0x16ad828b4,0x0f285651c
- .quad 0x041d17b64,0x19425cbba
- .quad 0x1fae1cc66,0x010746f3c
- .quad 0x1a75b4b00,0x18db37e8a
- .quad 0x0f872e54c,0x1c24afea4
- .quad 0x01e41e9fc,0x04c144932
- .quad 0x086d8e4d2,0x0271d9844
- .quad 0x160f7af7a,0x052148f02
- .quad 0x05bb8f1bc,0x08e766a0c
- .quad 0x0a90fd27a,0x0a3c6f37a
- .quad 0x0b3af077a,0x093a5f730
- .quad 0x04984d782,0x1d22c238e
- .quad 0x0ca6ef3ac,0x06cb08e5c
- .quad 0x0234e0b26,0x063ded06a
- .quad 0x1d88abd4a,0x06b749fb2
- .quad 0x04597456a,0x04d56973c
- .quad 0x0e9e28eb4,0x1167f94f2
- .quad 0x07b3ff57a,0x19385bf2e
- .quad 0x0c9c8b782,0x0cec3662e
- .quad 0x13a9cba9e,0x0e417f38a
- .quad 0x093e106a4,0x19329634a
- .quad 0x167001a9c,0x14e727980
- .quad 0x1ddffc5d4,0x0e6fc4e6a
- .quad 0x00df04680,0x0d104b8fc
- .quad 0x02342001e,0x08227bb8a
- .quad 0x00a2a8d7e,0x05b397730
- .quad 0x168763fa6,0x0b0cd4768
- .quad 0x1ed5a407a,0x0e78eb416
- .quad 0x0d2c3ed1a,0x13c2b89c4
- .quad 0x0995a5724,0x1641378f0
- .quad 0x19b1afbc4,0x0d7a4825c
- .quad 0x109ffedc0,0x08d96551c
- .quad 0x0f2271e60,0x10f5ff2ba
- .quad 0x00b0bf8ca,0x00bf80dd2
- .quad 0x123888b7a,0x00167d312
- .quad 0x1e888f7dc,0x18dcddd1c
- .quad 0x002ee03b2,0x0f6076544
- .quad 0x183e8d8fe,0x06a45d2b2
- .quad 0x133d7a042,0x026f6a60a
- .quad 0x116b0f50c,0x1dd3e10e8
- .quad 0x05fabe670,0x1a2adb74e
- .quad 0x130004488,0x0de87806c
- .quad 0x000bcf5f6,0x19d34af3a
- .quad 0x18f0c7078,0x014338754
- .quad 0x017f27698,0x049c3cc9c
- .quad 0x058ca5f00,0x15e3e77ee
- .quad 0x1af900c24,0x068bce87a
- .quad 0x0b5cfca28,0x0dd07448e
- .quad 0x0ded288f8,0x1524fa6c6
- .quad 0x059f229bc,0x1d8048348
- .quad 0x06d390dec,0x16cba8aca
- .quad 0x037170390,0x0a3e3e02c
- .quad 0x06353c1cc,0x042d98888
- .quad 0x0c4584f5c,0x0d73c7bea
- .quad 0x1f16a3418,0x1329d9f7e
- .quad 0x0531377e2,0x185137662
- .quad 0x1d8d9ca7c,0x1b1c69528
- .quad 0x0b25b29f2,0x18a08b5bc
- .quad 0x19fb2a8b0,0x02178513a
- .quad 0x1a08fe6ac,0x1da758ae0
- .quad 0x045cddf4e,0x0e0ac139e
- .quad 0x1a91647f2,0x169cf9eb0
- .quad 0x1a0f717c4,0x0170076fa
+ .long 0x493c7d27, 0x00000001
+ .long 0xba4fc28e, 0x493c7d27
+ .long 0xddc0152b, 0xf20c0dfe
+ .long 0x9e4addf8, 0xba4fc28e
+ .long 0x39d3b296, 0x3da6d0cb
+ .long 0x0715ce53, 0xddc0152b
+ .long 0x47db8317, 0x1c291d04
+ .long 0x0d3b6092, 0x9e4addf8
+ .long 0xc96cfdc0, 0x740eef02
+ .long 0x878a92a7, 0x39d3b296
+ .long 0xdaece73e, 0x083a6eec
+ .long 0xab7aff2a, 0x0715ce53
+ .long 0x2162d385, 0xc49f4f67
+ .long 0x83348832, 0x47db8317
+ .long 0x299847d5, 0x2ad91c30
+ .long 0xb9e02b86, 0x0d3b6092
+ .long 0x18b33a4e, 0x6992cea2
+ .long 0xb6dd949b, 0xc96cfdc0
+ .long 0x78d9ccb7, 0x7e908048
+ .long 0xbac2fd7b, 0x878a92a7
+ .long 0xa60ce07b, 0x1b3d8f29
+ .long 0xce7f39f4, 0xdaece73e
+ .long 0x61d82e56, 0xf1d0f55e
+ .long 0xd270f1a2, 0xab7aff2a
+ .long 0xc619809d, 0xa87ab8a8
+ .long 0x2b3cac5d, 0x2162d385
+ .long 0x65863b64, 0x8462d800
+ .long 0x1b03397f, 0x83348832
+ .long 0xebb883bd, 0x71d111a8
+ .long 0xb3e32c28, 0x299847d5
+ .long 0x064f7f26, 0xffd852c6
+ .long 0xdd7e3b0c, 0xb9e02b86
+ .long 0xf285651c, 0xdcb17aa4
+ .long 0x10746f3c, 0x18b33a4e
+ .long 0xc7a68855, 0xf37c5aee
+ .long 0x271d9844, 0xb6dd949b
+ .long 0x8e766a0c, 0x6051d5a2
+ .long 0x93a5f730, 0x78d9ccb7
+ .long 0x6cb08e5c, 0x18b0d4ff
+ .long 0x6b749fb2, 0xbac2fd7b
+ .long 0x1393e203, 0x21f3d99c
+ .long 0xcec3662e, 0xa60ce07b
+ .long 0x96c515bb, 0x8f158014
+ .long 0xe6fc4e6a, 0xce7f39f4
+ .long 0x8227bb8a, 0xa00457f7
+ .long 0xb0cd4768, 0x61d82e56
+ .long 0x39c7ff35, 0x8d6d2c43
+ .long 0xd7a4825c, 0xd270f1a2
+ .long 0x0ab3844b, 0x00ac29cf
+ .long 0x0167d312, 0xc619809d
+ .long 0xf6076544, 0xe9adf796
+ .long 0x26f6a60a, 0x2b3cac5d
+ .long 0xa741c1bf, 0x96638b34
+ .long 0x98d8d9cb, 0x65863b64
+ .long 0x49c3cc9c, 0xe0e9f351
+ .long 0x68bce87a, 0x1b03397f
+ .long 0x57a3d037, 0x9af01f2d
+ .long 0x6956fc3b, 0xebb883bd
+ .long 0x42d98888, 0x2cff42cf
+ .long 0x3771e98f, 0xb3e32c28
+ .long 0xb42ae3d9, 0x88f25a3a
+ .long 0x2178513a, 0x064f7f26
+ .long 0xe0ac139e, 0x4e36f0b0
+ .long 0x170076fa, 0xdd7e3b0c
+ .long 0x444dd413, 0xbd6f81f8
+ .long 0x6f345e45, 0xf285651c
+ .long 0x41d17b64, 0x91c9bd4b
+ .long 0xff0dba97, 0x10746f3c
+ .long 0xa2b73df1, 0x885f087b
+ .long 0xf872e54c, 0xc7a68855
+ .long 0x1e41e9fc, 0x4c144932
+ .long 0x86d8e4d2, 0x271d9844
+ .long 0x651bd98b, 0x52148f02
+ .long 0x5bb8f1bc, 0x8e766a0c
+ .long 0xa90fd27a, 0xa3c6f37a
+ .long 0xb3af077a, 0x93a5f730
+ .long 0x4984d782, 0xd7c0557f
+ .long 0xca6ef3ac, 0x6cb08e5c
+ .long 0x234e0b26, 0x63ded06a
+ .long 0xdd66cbbb, 0x6b749fb2
+ .long 0x4597456a, 0x4d56973c
+ .long 0xe9e28eb4, 0x1393e203
+ .long 0x7b3ff57a, 0x9669c9df
+ .long 0xc9c8b782, 0xcec3662e
+ .long 0x3f70cc6f, 0xe417f38a
+ .long 0x93e106a4, 0x96c515bb
+ .long 0x62ec6c6d, 0x4b9e0f71
+ .long 0xd813b325, 0xe6fc4e6a
+ .long 0x0df04680, 0xd104b8fc
+ .long 0x2342001e, 0x8227bb8a
+ .long 0x0a2a8d7e, 0x5b397730
+ .long 0x6d9a4957, 0xb0cd4768
+ .long 0xe8b6368b, 0xe78eb416
+ .long 0xd2c3ed1a, 0x39c7ff35
+ .long 0x995a5724, 0x61ff0e01
+ .long 0x9ef68d35, 0xd7a4825c
+ .long 0x0c139b31, 0x8d96551c
+ .long 0xf2271e60, 0x0ab3844b
+ .long 0x0b0bf8ca, 0x0bf80dd2
+ .long 0x2664fd8b, 0x0167d312
+ .long 0xed64812d, 0x8821abed
+ .long 0x02ee03b2, 0xf6076544
+ .long 0x8604ae0f, 0x6a45d2b2
+ .long 0x363bd6b3, 0x26f6a60a
+ .long 0x135c83fd, 0xd8d26619
+ .long 0x5fabe670, 0xa741c1bf
+ .long 0x35ec3279, 0xde87806c
+ .long 0x00bcf5f6, 0x98d8d9cb
+ .long 0x8ae00689, 0x14338754
+ .long 0x17f27698, 0x49c3cc9c
+ .long 0x58ca5f00, 0x5bd2011f
+ .long 0xaa7c7ad5, 0x68bce87a
+ .long 0xb5cfca28, 0xdd07448e
+ .long 0xded288f8, 0x57a3d037
+ .long 0x59f229bc, 0xdde8f5b9
+ .long 0x6d390dec, 0x6956fc3b
+ .long 0x37170390, 0xa3e3e02c
+ .long 0x6353c1cc, 0x42d98888
+ .long 0xc4584f5c, 0xd73c7bea
+ .long 0xf48642e9, 0x3771e98f
+ .long 0x531377e2, 0x80ff0093
+ .long 0xdd35bc8d, 0xb42ae3d9
+ .long 0xb25b29f2, 0x8fe4c34d
+ .long 0x9a5ede41, 0x2178513a
+ .long 0xa563905d, 0xdf99fc11
+ .long 0x45cddf4e, 0xe0ac139e
+ .long 0xacfa3103, 0x6c23e841
+ .long 0xa51b6135, 0x170076fa
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
new file mode 100644
index 00000000000..038f6ae87c5
--- /dev/null
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -0,0 +1,805 @@
+/*
+ * des3_ede-asm_64.S - x86-64 assembly implementation of 3DES cipher
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/linkage.h>
+
+.file "des3_ede-asm_64.S"
+.text
+
+#define s1 .L_s1
+#define s2 ((s1) + (64*8))
+#define s3 ((s2) + (64*8))
+#define s4 ((s3) + (64*8))
+#define s5 ((s4) + (64*8))
+#define s6 ((s5) + (64*8))
+#define s7 ((s6) + (64*8))
+#define s8 ((s7) + (64*8))
+
+/* register macros */
+#define CTX %rdi
+
+#define RL0 %r8
+#define RL1 %r9
+#define RL2 %r10
+
+#define RL0d %r8d
+#define RL1d %r9d
+#define RL2d %r10d
+
+#define RR0 %r11
+#define RR1 %r12
+#define RR2 %r13
+
+#define RR0d %r11d
+#define RR1d %r12d
+#define RR2d %r13d
+
+#define RW0 %rax
+#define RW1 %rbx
+#define RW2 %rcx
+
+#define RW0d %eax
+#define RW1d %ebx
+#define RW2d %ecx
+
+#define RW0bl %al
+#define RW1bl %bl
+#define RW2bl %cl
+
+#define RW0bh %ah
+#define RW1bh %bh
+#define RW2bh %ch
+
+#define RT0 %r15
+#define RT1 %rbp
+#define RT2 %r14
+#define RT3 %rdx
+
+#define RT0d %r15d
+#define RT1d %ebp
+#define RT2d %r14d
+#define RT3d %edx
+
+/***********************************************************************
+ * 1-way 3DES
+ ***********************************************************************/
+#define do_permutation(a, b, offset, mask) \
+ movl a, RT0d; \
+ shrl $(offset), RT0d; \
+ xorl b, RT0d; \
+ andl $(mask), RT0d; \
+ xorl RT0d, b; \
+ shll $(offset), RT0d; \
+ xorl RT0d, a;
+
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation(left, right) \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ movl left##d, RW0d; \
+ roll $1, right##d; \
+ xorl right##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##d; \
+ xorl RW0d, right##d; \
+ roll $1, left##d; \
+ expand_to_64bits(right, RT3); \
+ expand_to_64bits(left, RT3);
+
+#define final_permutation(left, right) \
+ compress_to_64bits(right); \
+ compress_to_64bits(left); \
+ movl right##d, RW0d; \
+ rorl $1, left##d; \
+ xorl left##d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##d; \
+ xorl RW0d, left##d; \
+ rorl $1, right##d; \
+ do_permutation(right##d, left##d, 8, 0x00ff00ff); \
+ do_permutation(right##d, left##d, 2, 0x33333333); \
+ do_permutation(left##d, right##d, 16, 0x0000ffff); \
+ do_permutation(left##d, right##d, 4, 0x0f0f0f0f);
+
+#define round1(n, from, to, load_next_key) \
+ xorq from, RW0; \
+ \
+ movzbl RW0bl, RT0d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ shrq $16, RW0; \
+ movq s8(, RT0, 8), RT0; \
+ xorq s6(, RT1, 8), to; \
+ movzbl RW0bl, RL1d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ xorq s4(, RT2, 8), RT0; \
+ xorq s2(, RT3, 8), to; \
+ movzbl RW0bl, RT2d; \
+ movzbl RW0bh, RT3d; \
+ xorq s7(, RL1, 8), RT0; \
+ xorq s5(, RT1, 8), to; \
+ xorq s3(, RT2, 8), RT0; \
+ load_next_key(n, RW0); \
+ xorq RT0, to; \
+ xorq s1(, RT3, 8), to; \
+
+#define load_next_key(n, RWx) \
+ movq (((n) + 1) * 8)(CTX), RWx;
+
+#define dummy2(a, b) /*_*/
+
+#define read_block(io, left, right) \
+ movl (io), left##d; \
+ movl 4(io), right##d; \
+ bswapl left##d; \
+ bswapl right##d;
+
+#define write_block(io, left, right) \
+ bswapl left##d; \
+ bswapl right##d; \
+ movl left##d, (io); \
+ movl right##d, 4(io);
+
+ENTRY(des3_ede_x86_64_crypt_blk)
+ /* input:
+ * %rdi: round keys, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+ pushq %rbp;
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+ pushq %r14;
+ pushq %r15;
+
+ read_block(%rdx, RL0, RR0);
+ initial_permutation(RL0, RR0);
+
+ movq (CTX), RW0;
+
+ round1(0, RR0, RL0, load_next_key);
+ round1(1, RL0, RR0, load_next_key);
+ round1(2, RR0, RL0, load_next_key);
+ round1(3, RL0, RR0, load_next_key);
+ round1(4, RR0, RL0, load_next_key);
+ round1(5, RL0, RR0, load_next_key);
+ round1(6, RR0, RL0, load_next_key);
+ round1(7, RL0, RR0, load_next_key);
+ round1(8, RR0, RL0, load_next_key);
+ round1(9, RL0, RR0, load_next_key);
+ round1(10, RR0, RL0, load_next_key);
+ round1(11, RL0, RR0, load_next_key);
+ round1(12, RR0, RL0, load_next_key);
+ round1(13, RL0, RR0, load_next_key);
+ round1(14, RR0, RL0, load_next_key);
+ round1(15, RL0, RR0, load_next_key);
+
+ round1(16+0, RL0, RR0, load_next_key);
+ round1(16+1, RR0, RL0, load_next_key);
+ round1(16+2, RL0, RR0, load_next_key);
+ round1(16+3, RR0, RL0, load_next_key);
+ round1(16+4, RL0, RR0, load_next_key);
+ round1(16+5, RR0, RL0, load_next_key);
+ round1(16+6, RL0, RR0, load_next_key);
+ round1(16+7, RR0, RL0, load_next_key);
+ round1(16+8, RL0, RR0, load_next_key);
+ round1(16+9, RR0, RL0, load_next_key);
+ round1(16+10, RL0, RR0, load_next_key);
+ round1(16+11, RR0, RL0, load_next_key);
+ round1(16+12, RL0, RR0, load_next_key);
+ round1(16+13, RR0, RL0, load_next_key);
+ round1(16+14, RL0, RR0, load_next_key);
+ round1(16+15, RR0, RL0, load_next_key);
+
+ round1(32+0, RR0, RL0, load_next_key);
+ round1(32+1, RL0, RR0, load_next_key);
+ round1(32+2, RR0, RL0, load_next_key);
+ round1(32+3, RL0, RR0, load_next_key);
+ round1(32+4, RR0, RL0, load_next_key);
+ round1(32+5, RL0, RR0, load_next_key);
+ round1(32+6, RR0, RL0, load_next_key);
+ round1(32+7, RL0, RR0, load_next_key);
+ round1(32+8, RR0, RL0, load_next_key);
+ round1(32+9, RL0, RR0, load_next_key);
+ round1(32+10, RR0, RL0, load_next_key);
+ round1(32+11, RL0, RR0, load_next_key);
+ round1(32+12, RR0, RL0, load_next_key);
+ round1(32+13, RL0, RR0, load_next_key);
+ round1(32+14, RR0, RL0, load_next_key);
+ round1(32+15, RL0, RR0, dummy2);
+
+ final_permutation(RR0, RL0);
+ write_block(%rsi, RR0, RL0);
+
+ popq %r15;
+ popq %r14;
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+ popq %rbp;
+
+ ret;
+ENDPROC(des3_ede_x86_64_crypt_blk)
+
+/***********************************************************************
+ * 3-way 3DES
+ ***********************************************************************/
+#define expand_to_64bits(val, mask) \
+ movl val##d, RT0d; \
+ rorl $4, RT0d; \
+ shlq $32, RT0; \
+ orq RT0, val; \
+ andq mask, val;
+
+#define compress_to_64bits(val) \
+ movq val, RT0; \
+ shrq $32, RT0; \
+ roll $4, RT0d; \
+ orl RT0d, val##d;
+
+#define initial_permutation3(left, right) \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ \
+ movabs $0x3f3f3f3f3f3f3f3f, RT3; \
+ \
+ movl left##0d, RW0d; \
+ roll $1, right##0d; \
+ xorl right##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, left##0d; \
+ xorl RW0d, right##0d; \
+ roll $1, left##0d; \
+ expand_to_64bits(right##0, RT3); \
+ expand_to_64bits(left##0, RT3); \
+ movl left##1d, RW1d; \
+ roll $1, right##1d; \
+ xorl right##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, left##1d; \
+ xorl RW1d, right##1d; \
+ roll $1, left##1d; \
+ expand_to_64bits(right##1, RT3); \
+ expand_to_64bits(left##1, RT3); \
+ movl left##2d, RW2d; \
+ roll $1, right##2d; \
+ xorl right##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, left##2d; \
+ xorl RW2d, right##2d; \
+ roll $1, left##2d; \
+ expand_to_64bits(right##2, RT3); \
+ expand_to_64bits(left##2, RT3);
+
+#define final_permutation3(left, right) \
+ compress_to_64bits(right##0); \
+ compress_to_64bits(left##0); \
+ movl right##0d, RW0d; \
+ rorl $1, left##0d; \
+ xorl left##0d, RW0d; \
+ andl $0xaaaaaaaa, RW0d; \
+ xorl RW0d, right##0d; \
+ xorl RW0d, left##0d; \
+ rorl $1, right##0d; \
+ compress_to_64bits(right##1); \
+ compress_to_64bits(left##1); \
+ movl right##1d, RW1d; \
+ rorl $1, left##1d; \
+ xorl left##1d, RW1d; \
+ andl $0xaaaaaaaa, RW1d; \
+ xorl RW1d, right##1d; \
+ xorl RW1d, left##1d; \
+ rorl $1, right##1d; \
+ compress_to_64bits(right##2); \
+ compress_to_64bits(left##2); \
+ movl right##2d, RW2d; \
+ rorl $1, left##2d; \
+ xorl left##2d, RW2d; \
+ andl $0xaaaaaaaa, RW2d; \
+ xorl RW2d, right##2d; \
+ xorl RW2d, left##2d; \
+ rorl $1, right##2d; \
+ \
+ do_permutation(right##0d, left##0d, 8, 0x00ff00ff); \
+ do_permutation(right##0d, left##0d, 2, 0x33333333); \
+ do_permutation(right##1d, left##1d, 8, 0x00ff00ff); \
+ do_permutation(right##1d, left##1d, 2, 0x33333333); \
+ do_permutation(right##2d, left##2d, 8, 0x00ff00ff); \
+ do_permutation(right##2d, left##2d, 2, 0x33333333); \
+ \
+ do_permutation(left##0d, right##0d, 16, 0x0000ffff); \
+ do_permutation(left##0d, right##0d, 4, 0x0f0f0f0f); \
+ do_permutation(left##1d, right##1d, 16, 0x0000ffff); \
+ do_permutation(left##1d, right##1d, 4, 0x0f0f0f0f); \
+ do_permutation(left##2d, right##2d, 16, 0x0000ffff); \
+ do_permutation(left##2d, right##2d, 4, 0x0f0f0f0f);
+
+#define round3(n, from, to, load_next_key, do_movq) \
+ xorq from##0, RW0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ xorq s8(, RT3, 8), to##0; \
+ xorq s6(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrq $16, RW0; \
+ xorq s4(, RT3, 8), to##0; \
+ xorq s2(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ shrl $16, RW0d; \
+ xorq s7(, RT3, 8), to##0; \
+ xorq s5(, RT1, 8), to##0; \
+ movzbl RW0bl, RT3d; \
+ movzbl RW0bh, RT1d; \
+ load_next_key(n, RW0); \
+ xorq s3(, RT3, 8), to##0; \
+ xorq s1(, RT1, 8), to##0; \
+ xorq from##1, RW1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ xorq s8(, RT3, 8), to##1; \
+ xorq s6(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrq $16, RW1; \
+ xorq s4(, RT3, 8), to##1; \
+ xorq s2(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ shrl $16, RW1d; \
+ xorq s7(, RT3, 8), to##1; \
+ xorq s5(, RT1, 8), to##1; \
+ movzbl RW1bl, RT3d; \
+ movzbl RW1bh, RT1d; \
+ do_movq(RW0, RW1); \
+ xorq s3(, RT3, 8), to##1; \
+ xorq s1(, RT1, 8), to##1; \
+ xorq from##2, RW2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ xorq s8(, RT3, 8), to##2; \
+ xorq s6(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrq $16, RW2; \
+ xorq s4(, RT3, 8), to##2; \
+ xorq s2(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ shrl $16, RW2d; \
+ xorq s7(, RT3, 8), to##2; \
+ xorq s5(, RT1, 8), to##2; \
+ movzbl RW2bl, RT3d; \
+ movzbl RW2bh, RT1d; \
+ do_movq(RW0, RW2); \
+ xorq s3(, RT3, 8), to##2; \
+ xorq s1(, RT1, 8), to##2;
+
+#define __movq(src, dst) \
+ movq src, dst;
+
+ENTRY(des3_ede_x86_64_crypt_blk_3way)
+ /* input:
+ * %rdi: ctx, round keys
+ * %rsi: dst (3 blocks)
+ * %rdx: src (3 blocks)
+ */
+
+ pushq %rbp;
+ pushq %rbx;
+ pushq %r12;
+ pushq %r13;
+ pushq %r14;
+ pushq %r15;
+
+ /* load input */
+ movl 0 * 4(%rdx), RL0d;
+ movl 1 * 4(%rdx), RR0d;
+ movl 2 * 4(%rdx), RL1d;
+ movl 3 * 4(%rdx), RR1d;
+ movl 4 * 4(%rdx), RL2d;
+ movl 5 * 4(%rdx), RR2d;
+
+ bswapl RL0d;
+ bswapl RR0d;
+ bswapl RL1d;
+ bswapl RR1d;
+ bswapl RL2d;
+ bswapl RR2d;
+
+ initial_permutation3(RL, RR);
+
+ movq 0(CTX), RW0;
+ movq RW0, RW1;
+ movq RW0, RW2;
+
+ round3(0, RR, RL, load_next_key, __movq);
+ round3(1, RL, RR, load_next_key, __movq);
+ round3(2, RR, RL, load_next_key, __movq);
+ round3(3, RL, RR, load_next_key, __movq);
+ round3(4, RR, RL, load_next_key, __movq);
+ round3(5, RL, RR, load_next_key, __movq);
+ round3(6, RR, RL, load_next_key, __movq);
+ round3(7, RL, RR, load_next_key, __movq);
+ round3(8, RR, RL, load_next_key, __movq);
+ round3(9, RL, RR, load_next_key, __movq);
+ round3(10, RR, RL, load_next_key, __movq);
+ round3(11, RL, RR, load_next_key, __movq);
+ round3(12, RR, RL, load_next_key, __movq);
+ round3(13, RL, RR, load_next_key, __movq);
+ round3(14, RR, RL, load_next_key, __movq);
+ round3(15, RL, RR, load_next_key, __movq);
+
+ round3(16+0, RL, RR, load_next_key, __movq);
+ round3(16+1, RR, RL, load_next_key, __movq);
+ round3(16+2, RL, RR, load_next_key, __movq);
+ round3(16+3, RR, RL, load_next_key, __movq);
+ round3(16+4, RL, RR, load_next_key, __movq);
+ round3(16+5, RR, RL, load_next_key, __movq);
+ round3(16+6, RL, RR, load_next_key, __movq);
+ round3(16+7, RR, RL, load_next_key, __movq);
+ round3(16+8, RL, RR, load_next_key, __movq);
+ round3(16+9, RR, RL, load_next_key, __movq);
+ round3(16+10, RL, RR, load_next_key, __movq);
+ round3(16+11, RR, RL, load_next_key, __movq);
+ round3(16+12, RL, RR, load_next_key, __movq);
+ round3(16+13, RR, RL, load_next_key, __movq);
+ round3(16+14, RL, RR, load_next_key, __movq);
+ round3(16+15, RR, RL, load_next_key, __movq);
+
+ round3(32+0, RR, RL, load_next_key, __movq);
+ round3(32+1, RL, RR, load_next_key, __movq);
+ round3(32+2, RR, RL, load_next_key, __movq);
+ round3(32+3, RL, RR, load_next_key, __movq);
+ round3(32+4, RR, RL, load_next_key, __movq);
+ round3(32+5, RL, RR, load_next_key, __movq);
+ round3(32+6, RR, RL, load_next_key, __movq);
+ round3(32+7, RL, RR, load_next_key, __movq);
+ round3(32+8, RR, RL, load_next_key, __movq);
+ round3(32+9, RL, RR, load_next_key, __movq);
+ round3(32+10, RR, RL, load_next_key, __movq);
+ round3(32+11, RL, RR, load_next_key, __movq);
+ round3(32+12, RR, RL, load_next_key, __movq);
+ round3(32+13, RL, RR, load_next_key, __movq);
+ round3(32+14, RR, RL, load_next_key, __movq);
+ round3(32+15, RL, RR, dummy2, dummy2);
+
+ final_permutation3(RR, RL);
+
+ bswapl RR0d;
+ bswapl RL0d;
+ bswapl RR1d;
+ bswapl RL1d;
+ bswapl RR2d;
+ bswapl RL2d;
+
+ movl RR0d, 0 * 4(%rsi);
+ movl RL0d, 1 * 4(%rsi);
+ movl RR1d, 2 * 4(%rsi);
+ movl RL1d, 3 * 4(%rsi);
+ movl RR2d, 4 * 4(%rsi);
+ movl RL2d, 5 * 4(%rsi);
+
+ popq %r15;
+ popq %r14;
+ popq %r13;
+ popq %r12;
+ popq %rbx;
+ popq %rbp;
+
+ ret;
+ENDPROC(des3_ede_x86_64_crypt_blk_3way)
+
+.data
+.align 16
+.L_s1:
+ .quad 0x0010100001010400, 0x0000000000000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0010100001010004, 0x0000100000010404
+ .quad 0x0000000000000004, 0x0000100000010000
+ .quad 0x0000000000000400, 0x0010100001010400
+ .quad 0x0010100001010404, 0x0000000000000400
+ .quad 0x0010000001000404, 0x0010100001010004
+ .quad 0x0010000001000000, 0x0000000000000004
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000100000010400
+ .quad 0x0000100000010400, 0x0010100001010000
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0000100000010004, 0x0010000001000004
+ .quad 0x0010000001000004, 0x0000100000010004
+ .quad 0x0000000000000000, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010000001000000
+ .quad 0x0000100000010000, 0x0010100001010404
+ .quad 0x0000000000000004, 0x0010100001010000
+ .quad 0x0010100001010400, 0x0010000001000000
+ .quad 0x0010000001000000, 0x0000000000000400
+ .quad 0x0010100001010004, 0x0000100000010000
+ .quad 0x0000100000010400, 0x0010000001000004
+ .quad 0x0000000000000400, 0x0000000000000004
+ .quad 0x0010000001000404, 0x0000100000010404
+ .quad 0x0010100001010404, 0x0000100000010004
+ .quad 0x0010100001010000, 0x0010000001000404
+ .quad 0x0010000001000004, 0x0000000000000404
+ .quad 0x0000100000010404, 0x0010100001010400
+ .quad 0x0000000000000404, 0x0010000001000400
+ .quad 0x0010000001000400, 0x0000000000000000
+ .quad 0x0000100000010004, 0x0000100000010400
+ .quad 0x0000000000000000, 0x0010100001010004
+.L_s2:
+ .quad 0x0801080200100020, 0x0800080000000000
+ .quad 0x0000080000000000, 0x0001080200100020
+ .quad 0x0001000000100000, 0x0000000200000020
+ .quad 0x0801000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0801080200100020
+ .quad 0x0801080000100000, 0x0800000000000000
+ .quad 0x0800080000000000, 0x0001000000100000
+ .quad 0x0000000200000020, 0x0801000200100020
+ .quad 0x0001080000100000, 0x0001000200100020
+ .quad 0x0800080200000020, 0x0000000000000000
+ .quad 0x0800000000000000, 0x0000080000000000
+ .quad 0x0001080200100020, 0x0801000000100000
+ .quad 0x0001000200100020, 0x0800000200000020
+ .quad 0x0000000000000000, 0x0001080000100000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0801000000100000, 0x0000080200000020
+ .quad 0x0000000000000000, 0x0001080200100020
+ .quad 0x0801000200100020, 0x0001000000100000
+ .quad 0x0800080200000020, 0x0801000000100000
+ .quad 0x0801080000100000, 0x0000080000000000
+ .quad 0x0801000000100000, 0x0800080000000000
+ .quad 0x0000000200000020, 0x0801080200100020
+ .quad 0x0001080200100020, 0x0000000200000020
+ .quad 0x0000080000000000, 0x0800000000000000
+ .quad 0x0000080200000020, 0x0801080000100000
+ .quad 0x0001000000100000, 0x0800000200000020
+ .quad 0x0001000200100020, 0x0800080200000020
+ .quad 0x0800000200000020, 0x0001000200100020
+ .quad 0x0001080000100000, 0x0000000000000000
+ .quad 0x0800080000000000, 0x0000080200000020
+ .quad 0x0800000000000000, 0x0801000200100020
+ .quad 0x0801080200100020, 0x0001080000100000
+.L_s3:
+ .quad 0x0000002000000208, 0x0000202008020200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000202000020208, 0x0000002008000200
+ .quad 0x0000200000020008, 0x0000000008000008
+ .quad 0x0000000008000008, 0x0000200000020000
+ .quad 0x0000202008020208, 0x0000200000020008
+ .quad 0x0000200008020000, 0x0000002000000208
+ .quad 0x0000000008000000, 0x0000000000000008
+ .quad 0x0000202008020200, 0x0000002000000200
+ .quad 0x0000202000020200, 0x0000200008020000
+ .quad 0x0000200008020008, 0x0000202000020208
+ .quad 0x0000002008000208, 0x0000202000020200
+ .quad 0x0000200000020000, 0x0000002008000208
+ .quad 0x0000000000000008, 0x0000202008020208
+ .quad 0x0000002000000200, 0x0000000008000000
+ .quad 0x0000202008020200, 0x0000000008000000
+ .quad 0x0000200000020008, 0x0000002000000208
+ .quad 0x0000200000020000, 0x0000202008020200
+ .quad 0x0000002008000200, 0x0000000000000000
+ .quad 0x0000002000000200, 0x0000200000020008
+ .quad 0x0000202008020208, 0x0000002008000200
+ .quad 0x0000000008000008, 0x0000002000000200
+ .quad 0x0000000000000000, 0x0000200008020008
+ .quad 0x0000002008000208, 0x0000200000020000
+ .quad 0x0000000008000000, 0x0000202008020208
+ .quad 0x0000000000000008, 0x0000202000020208
+ .quad 0x0000202000020200, 0x0000000008000008
+ .quad 0x0000200008020000, 0x0000002008000208
+ .quad 0x0000002000000208, 0x0000200008020000
+ .quad 0x0000202000020208, 0x0000000000000008
+ .quad 0x0000200008020008, 0x0000202000020200
+.L_s4:
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x0000020000002000, 0x0008020800002000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x0008000800000000, 0x1008000000000001
+ .quad 0x0008020000002000, 0x1008020800002001
+ .quad 0x1000000800000001, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0008020000002000
+ .quad 0x0000020800002000, 0x0008000800000000
+ .quad 0x1008000800000001, 0x1000000000000001
+ .quad 0x1008020000002001, 0x1000020800002001
+ .quad 0x1000020800002001, 0x0000000800000000
+ .quad 0x1008020800002001, 0x1000000800000001
+ .quad 0x1000000000000001, 0x0000020000002000
+ .quad 0x1008000000000001, 0x1000020000002001
+ .quad 0x0008020800002000, 0x1008000800000001
+ .quad 0x1000020000002001, 0x0000020800002000
+ .quad 0x0008000000000000, 0x1008020000002001
+ .quad 0x0000000800000000, 0x0008000000000000
+ .quad 0x0000020000002000, 0x0008020800002000
+.L_s5:
+ .quad 0x0000001000000100, 0x0020001002080100
+ .quad 0x0020000002080000, 0x0420001002000100
+ .quad 0x0000000000080000, 0x0000001000000100
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0400001000080100, 0x0000000000080000
+ .quad 0x0020001002000100, 0x0400001000080100
+ .quad 0x0420001002000100, 0x0420000002080000
+ .quad 0x0000001000080100, 0x0400000000000000
+ .quad 0x0020000002000000, 0x0400000000080000
+ .quad 0x0400000000080000, 0x0000000000000000
+ .quad 0x0400001000000100, 0x0420001002080100
+ .quad 0x0420001002080100, 0x0020001002000100
+ .quad 0x0420000002080000, 0x0400001000000100
+ .quad 0x0000000000000000, 0x0420000002000000
+ .quad 0x0020001002080100, 0x0020000002000000
+ .quad 0x0420000002000000, 0x0000001000080100
+ .quad 0x0000000000080000, 0x0420001002000100
+ .quad 0x0000001000000100, 0x0020000002000000
+ .quad 0x0400000000000000, 0x0020000002080000
+ .quad 0x0420001002000100, 0x0400001000080100
+ .quad 0x0020001002000100, 0x0400000000000000
+ .quad 0x0420000002080000, 0x0020001002080100
+ .quad 0x0400001000080100, 0x0000001000000100
+ .quad 0x0020000002000000, 0x0420000002080000
+ .quad 0x0420001002080100, 0x0000001000080100
+ .quad 0x0420000002000000, 0x0420001002080100
+ .quad 0x0020000002080000, 0x0000000000000000
+ .quad 0x0400000000080000, 0x0420000002000000
+ .quad 0x0000001000080100, 0x0020001002000100
+ .quad 0x0400001000000100, 0x0000000000080000
+ .quad 0x0000000000000000, 0x0400000000080000
+ .quad 0x0020001002080100, 0x0400001000000100
+.L_s6:
+ .quad 0x0200000120000010, 0x0204000020000000
+ .quad 0x0000040000000000, 0x0204040120000010
+ .quad 0x0204000020000000, 0x0000000100000010
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0200040020000000, 0x0004040100000010
+ .quad 0x0004000000000000, 0x0200000120000010
+ .quad 0x0004000100000010, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0000000000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000040000000000
+ .quad 0x0004040000000000, 0x0200040120000010
+ .quad 0x0000000100000010, 0x0204000120000010
+ .quad 0x0204000120000010, 0x0000000000000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000040100000010, 0x0004040000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0200040020000000, 0x0000000100000010
+ .quad 0x0204000120000010, 0x0004040000000000
+ .quad 0x0204040120000010, 0x0004000000000000
+ .quad 0x0000040100000010, 0x0200000120000010
+ .quad 0x0004000000000000, 0x0200040020000000
+ .quad 0x0200000020000000, 0x0000040100000010
+ .quad 0x0200000120000010, 0x0204040120000010
+ .quad 0x0004040000000000, 0x0204000020000000
+ .quad 0x0004040100000010, 0x0204040020000000
+ .quad 0x0000000000000000, 0x0204000120000010
+ .quad 0x0000000100000010, 0x0000040000000000
+ .quad 0x0204000020000000, 0x0004040100000010
+ .quad 0x0000040000000000, 0x0004000100000010
+ .quad 0x0200040120000010, 0x0000000000000000
+ .quad 0x0204040020000000, 0x0200000020000000
+ .quad 0x0004000100000010, 0x0200040120000010
+.L_s7:
+ .quad 0x0002000000200000, 0x2002000004200002
+ .quad 0x2000000004000802, 0x0000000000000000
+ .quad 0x0000000000000800, 0x2000000004000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2002000004200802, 0x0002000000200000
+ .quad 0x0000000000000000, 0x2000000004000002
+ .quad 0x2000000000000002, 0x0000000004000000
+ .quad 0x2002000004200002, 0x2000000000000802
+ .quad 0x0000000004000800, 0x2002000000200802
+ .quad 0x2002000000200002, 0x0000000004000800
+ .quad 0x2000000004000002, 0x0002000004200000
+ .quad 0x0002000004200800, 0x2002000000200002
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000000000802, 0x2002000004200802
+ .quad 0x0002000000200800, 0x2000000000000002
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0000000004000000, 0x0002000000200800
+ .quad 0x0002000000200000, 0x2000000004000802
+ .quad 0x2000000004000802, 0x2002000004200002
+ .quad 0x2002000004200002, 0x2000000000000002
+ .quad 0x2002000000200002, 0x0000000004000000
+ .quad 0x0000000004000800, 0x0002000000200000
+ .quad 0x0002000004200800, 0x2000000000000802
+ .quad 0x2002000000200802, 0x0002000004200800
+ .quad 0x2000000000000802, 0x2000000004000002
+ .quad 0x2002000004200802, 0x0002000004200000
+ .quad 0x0002000000200800, 0x0000000000000000
+ .quad 0x2000000000000002, 0x2002000004200802
+ .quad 0x0000000000000000, 0x2002000000200802
+ .quad 0x0002000004200000, 0x0000000000000800
+ .quad 0x2000000004000002, 0x0000000004000800
+ .quad 0x0000000000000800, 0x2002000000200002
+.L_s8:
+ .quad 0x0100010410001000, 0x0000010000001000
+ .quad 0x0000000000040000, 0x0100010410041000
+ .quad 0x0100000010000000, 0x0100010410001000
+ .quad 0x0000000400000000, 0x0100000010000000
+ .quad 0x0000000400040000, 0x0100000010040000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0100010010041000, 0x0000010400041000
+ .quad 0x0000010000001000, 0x0000000400000000
+ .quad 0x0100000010040000, 0x0100000410000000
+ .quad 0x0100010010001000, 0x0000010400001000
+ .quad 0x0000010000041000, 0x0000000400040000
+ .quad 0x0100000410040000, 0x0100010010041000
+ .quad 0x0000010400001000, 0x0000000000000000
+ .quad 0x0000000000000000, 0x0100000410040000
+ .quad 0x0100000410000000, 0x0100010010001000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0000010400041000, 0x0000000000040000
+ .quad 0x0100010010041000, 0x0000010000001000
+ .quad 0x0000000400000000, 0x0100000410040000
+ .quad 0x0000010000001000, 0x0000010400041000
+ .quad 0x0100010010001000, 0x0000000400000000
+ .quad 0x0100000410000000, 0x0100000010040000
+ .quad 0x0100000410040000, 0x0100000010000000
+ .quad 0x0000000000040000, 0x0100010410001000
+ .quad 0x0000000000000000, 0x0100010410041000
+ .quad 0x0000000400040000, 0x0100000410000000
+ .quad 0x0100000010040000, 0x0100010010001000
+ .quad 0x0100010410001000, 0x0000000000000000
+ .quad 0x0100010410041000, 0x0000010000041000
+ .quad 0x0000010000041000, 0x0000010400001000
+ .quad 0x0000010400001000, 0x0000000400040000
+ .quad 0x0100000010000000, 0x0100010010041000
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
new file mode 100644
index 00000000000..0e9c0668fe4
--- /dev/null
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -0,0 +1,509 @@
+/*
+ * Glue Code for assembler optimized version of 3DES
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <asm/processor.h>
+#include <crypto/des.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+
+struct des3_ede_x86_ctx {
+ u32 enc_expkey[DES3_EDE_EXPKEY_WORDS];
+ u32 dec_expkey[DES3_EDE_EXPKEY_WORDS];
+};
+
+/* regular block cipher functions */
+asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst,
+ const u8 *src);
+
+/* 3-way parallel cipher functions */
+asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst,
+ const u8 *src);
+
+static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *enc_ctx = ctx->enc_expkey;
+
+ des3_ede_x86_64_crypt_blk(enc_ctx, dst, src);
+}
+
+static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *dec_ctx = ctx->dec_expkey;
+
+ des3_ede_x86_64_crypt_blk(dec_ctx, dst, src);
+}
+
+static inline void des3_ede_enc_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *enc_ctx = ctx->enc_expkey;
+
+ des3_ede_x86_64_crypt_blk_3way(enc_ctx, dst, src);
+}
+
+static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ u32 *dec_ctx = ctx->dec_expkey;
+
+ des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src);
+}
+
+static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src);
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+ const u32 *expkey)
+{
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes;
+ int err;
+
+ err = blkcipher_walk_virt(desc, walk);
+
+ while ((nbytes = walk->nbytes)) {
+ u8 *wsrc = walk->src.virt.addr;
+ u8 *wdst = walk->dst.virt.addr;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ des3_ede_x86_64_crypt_blk_3way(expkey, wdst,
+ wsrc);
+
+ wsrc += bsize * 3;
+ wdst += bsize * 3;
+ nbytes -= bsize * 3;
+ } while (nbytes >= bsize * 3);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc);
+
+ wsrc += bsize;
+ wdst += bsize;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ err = blkcipher_walk_done(desc, walk, nbytes);
+ }
+
+ return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ecb_crypt(desc, &walk, ctx->enc_expkey);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ecb_crypt(desc, &walk, ctx->dec_expkey);
+}
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 *iv = (u64 *)walk->iv;
+
+ do {
+ *dst = *src ^ *iv;
+ des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ *(u64 *)walk->iv = *iv;
+ return nbytes;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_encrypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u64 *src = (u64 *)walk->src.virt.addr;
+ u64 *dst = (u64 *)walk->dst.virt.addr;
+ u64 ivs[3 - 1];
+ u64 last_iv;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ nbytes -= bsize * 3 - bsize;
+ src -= 3 - 1;
+ dst -= 3 - 1;
+
+ ivs[0] = src[0];
+ ivs[1] = src[1];
+
+ des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src);
+
+ dst[1] ^= ivs[0];
+ dst[2] ^= ivs[1];
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= bsize * 3);
+ }
+
+ /* Handle leftovers */
+ for (;;) {
+ des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ break;
+
+ *dst ^= *(src - 1);
+ src -= 1;
+ dst -= 1;
+ }
+
+done:
+ *dst ^= *(u64 *)walk->iv;
+ *(u64 *)walk->iv = last_iv;
+
+ return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_decrypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ return err;
+}
+
+static void ctr_crypt_final(struct des3_ede_x86_ctx *ctx,
+ struct blkcipher_walk *walk)
+{
+ u8 *ctrblk = walk->iv;
+ u8 keystream[DES3_EDE_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ des3_ede_enc_blk(ctx, keystream, ctrblk);
+ crypto_xor(keystream, src, nbytes);
+ memcpy(dst, keystream, nbytes);
+
+ crypto_inc(ctrblk, DES3_EDE_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ unsigned int bsize = DES3_EDE_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ __be64 *src = (__be64 *)walk->src.virt.addr;
+ __be64 *dst = (__be64 *)walk->dst.virt.addr;
+ u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
+ __be64 ctrblocks[3];
+
+ /* Process four block batch */
+ if (nbytes >= bsize * 3) {
+ do {
+ /* create ctrblks for parallel encrypt */
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+ ctrblocks[1] = cpu_to_be64(ctrblk++);
+ ctrblocks[2] = cpu_to_be64(ctrblk++);
+
+ des3_ede_enc_blk_3way(ctx, (u8 *)ctrblocks,
+ (u8 *)ctrblocks);
+
+ dst[0] = src[0] ^ ctrblocks[0];
+ dst[1] = src[1] ^ ctrblocks[1];
+ dst[2] = src[2] ^ ctrblocks[2];
+
+ src += 3;
+ dst += 3;
+ } while ((nbytes -= bsize * 3) >= bsize * 3);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ ctrblocks[0] = cpu_to_be64(ctrblk++);
+
+ des3_ede_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
+
+ dst[0] = src[0] ^ ctrblocks[0];
+
+ src += 1;
+ dst += 1;
+ } while ((nbytes -= bsize) >= bsize);
+
+done:
+ *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
+ return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, DES3_EDE_BLOCK_SIZE);
+
+ while ((nbytes = walk.nbytes) >= DES3_EDE_BLOCK_SIZE) {
+ nbytes = __ctr_crypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ if (walk.nbytes) {
+ ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+
+ return err;
+}
+
+static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm);
+ u32 i, j, tmp;
+ int err;
+
+ /* Generate encryption context using generic implementation. */
+ err = __des3_ede_setkey(ctx->enc_expkey, &tfm->crt_flags, key, keylen);
+ if (err < 0)
+ return err;
+
+ /* Fix encryption context for this implementation and form decryption
+ * context. */
+ j = DES3_EDE_EXPKEY_WORDS - 2;
+ for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) {
+ tmp = ror32(ctx->enc_expkey[i + 1], 4);
+ ctx->enc_expkey[i + 1] = tmp;
+
+ ctx->dec_expkey[j + 0] = ctx->enc_expkey[i + 0];
+ ctx->dec_expkey[j + 1] = tmp;
+ }
+
+ return 0;
+}
+
+static struct crypto_alg des3_ede_algs[4] = { {
+ .cra_name = "des3_ede",
+ .cra_driver_name = "des3_ede-asm",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = DES3_EDE_KEY_SIZE,
+ .cia_max_keysize = DES3_EDE_KEY_SIZE,
+ .cia_setkey = des3_ede_x86_setkey,
+ .cia_encrypt = des3_ede_x86_encrypt,
+ .cia_decrypt = des3_ede_x86_decrypt,
+ }
+ }
+}, {
+ .cra_name = "ecb(des3_ede)",
+ .cra_driver_name = "ecb-des3_ede-asm",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .setkey = des3_ede_x86_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ },
+ },
+}, {
+ .cra_name = "cbc(des3_ede)",
+ .cra_driver_name = "cbc-des3_ede-asm",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = DES3_EDE_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .ivsize = DES3_EDE_BLOCK_SIZE,
+ .setkey = des3_ede_x86_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+ },
+}, {
+ .cra_name = "ctr(des3_ede)",
+ .cra_driver_name = "ctr-des3_ede-asm",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct des3_ede_x86_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = DES3_EDE_KEY_SIZE,
+ .max_keysize = DES3_EDE_KEY_SIZE,
+ .ivsize = DES3_EDE_BLOCK_SIZE,
+ .setkey = des3_ede_x86_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ },
+ },
+} };
+
+static bool is_blacklisted_cpu(void)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return false;
+
+ if (boot_cpu_data.x86 == 0x0f) {
+ /*
+ * On Pentium 4, des3_ede-x86_64 is slower than generic C
+ * implementation because use of 64bit rotates (which are really
+ * slow on P4). Therefore blacklist P4s.
+ */
+ return true;
+ }
+
+ return false;
+}
+
+static int force;
+module_param(force, int, 0);
+MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
+
+static int __init des3_ede_x86_init(void)
+{
+ if (!force && is_blacklisted_cpu()) {
+ pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n");
+ return -ENODEV;
+ }
+
+ return crypto_register_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs));
+}
+
+static void __exit des3_ede_x86_fini(void)
+{
+ crypto_unregister_algs(des3_ede_algs, ARRAY_SIZE(des3_ede_algs));
+}
+
+module_init(des3_ede_x86_init);
+module_exit(des3_ede_x86_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized");
+MODULE_ALIAS("des3_ede");
+MODULE_ALIAS("des3_ede-asm");
+MODULE_ALIAS("des");
+MODULE_ALIAS("des-asm");
+MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index f30cd10293f..8626b03e83b 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -141,7 +141,7 @@ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
/* save number of bits */
bits[1] = cpu_to_be64(sctx->count[0] << 3);
- bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61;
+ bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
/* Pad out to 112 mod 128 and append length */
index = sctx->count[0] & 0x7f;
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 3ca9762e164..3bf000fab0a 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -5,6 +5,7 @@ genhdr-y += unistd_64.h
genhdr-y += unistd_x32.h
generic-y += clkdev.h
-generic-y += early_ioremap.h
generic-y += cputime.h
+generic-y += early_ioremap.h
generic-y += mcs_spinlock.h
+generic-y += scatterlist.h
diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h
index 66873297e9f..1b010a859b8 100644
--- a/arch/x86/include/asm/acenv.h
+++ b/arch/x86/include/asm/acenv.h
@@ -18,8 +18,6 @@
#define ACPI_FLUSH_CPU_CACHE() wbinvd()
-#ifdef CONFIG_ACPI
-
int __acpi_acquire_global_lock(unsigned int *lock);
int __acpi_release_global_lock(unsigned int *lock);
@@ -44,6 +42,4 @@ int __acpi_release_global_lock(unsigned int *lock);
: "=r"(n_hi), "=r"(n_lo) \
: "0"(n_hi), "1"(n_lo))
-#endif
-
#endif /* _ASM_X86_ACENV_H */
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index e06225eda63..0ab4f9fd268 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -121,6 +121,11 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
}
+static inline bool acpi_has_cpu_in_madt(void)
+{
+ return !!acpi_lapic;
+}
+
#else /* !CONFIG_ACPI */
#define acpi_lapic 0
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 0a3f9c9f98d..473bdbee378 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -161,6 +161,20 @@ static inline int alternatives_text_reserved(void *start, void *end)
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
: : "i" (0), ## input)
+/*
+ * This is similar to alternative_input. But it has two features and
+ * respective instructions.
+ *
+ * If CPU has feature2, newinstr2 is used.
+ * Otherwise, if CPU has feature1, newinstr1 is used.
+ * Otherwise, oldinstr is used.
+ */
+#define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2, \
+ feature2, input...) \
+ asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \
+ newinstr2, feature2) \
+ : : "i" (0), ## input)
+
/* Like alternative_input, but with a single output argument */
#define alternative_io(oldinstr, newinstr, feature, output, input...) \
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index c31fded99cf..465b309af25 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
{
volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
- alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP,
+ alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
ASM_OUTPUT2("=r" (v), "=m" (*addr)),
ASM_OUTPUT2("0" (v), "m" (*addr)));
}
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 5c7198cca5e..0f4460b5636 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -99,7 +99,7 @@
#if defined(CONFIG_X86_PPRO_FENCE)
/*
- * For either of these options x86 doesn't have a strong TSO memory
+ * For this option x86 doesn't have a strong TSO memory
* model and we should fall back to full barriers.
*/
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index d47786acb01..99c105d78b7 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -4,6 +4,8 @@
#include <linux/compiler.h>
#include <asm/alternative.h> /* Provides LOCK_PREFIX */
+#define __HAVE_ARCH_CMPXCHG 1
+
/*
* Non-existant functions to indicate usage errors at link time
* (or compile-time if the compiler implements __compiletime_error().
@@ -143,7 +145,6 @@ extern void __add_wrong_size(void)
# include <asm/cmpxchg_64.h>
#endif
-#ifdef __HAVE_ARCH_CMPXCHG
#define cmpxchg(ptr, old, new) \
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
@@ -152,7 +153,6 @@ extern void __add_wrong_size(void)
#define cmpxchg_local(ptr, old, new) \
__cmpxchg_local(ptr, old, new, sizeof(*(ptr)))
-#endif
/*
* xadd() adds "inc" to "*ptr" and atomically returns the previous
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index f8bf2eecab8..f7e14292648 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -34,8 +34,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 value)
: "memory");
}
-#define __HAVE_ARCH_CMPXCHG 1
-
#ifdef CONFIG_X86_CMPXCHG64
#define cmpxchg64(ptr, o, n) \
((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 614be87f1a9..1af94697aae 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -6,8 +6,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
*ptr = val;
}
-#define __HAVE_ARCH_CMPXCHG 1
-
#define cmpxchg64(ptr, o, n) \
({ \
BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index e265ff95d16..bb9b258d60e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -8,7 +8,7 @@
#include <asm/required-features.h>
#endif
-#define NCAPINTS 10 /* N 32-bit words worth of info */
+#define NCAPINTS 11 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -18,213 +18,218 @@
*/
/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
-#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */
-#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */
-#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */
-#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */
-#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */
-#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */
-#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
-#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */
-#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */
-#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */
-#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions */
+#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */
+#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */
+#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */
+#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */
+#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */
+#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */
+#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */
+#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */
/* (plus FCMOVcc, FCOMI with FPU) */
-#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */
-#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */
-#define X86_FEATURE_PN (0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLUSH (0*32+19) /* CLFLUSH instruction */
-#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */
-#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */
-#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
-#define X86_FEATURE_XMM (0*32+25) /* "sse" */
-#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */
-#define X86_FEATURE_SELFSNOOP (0*32+27) /* "ss" CPU self snoop */
-#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */
-#define X86_FEATURE_ACC (0*32+29) /* "tm" Automatic clock control */
-#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */
-#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */
+#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */
+#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */
+#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */
+#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */
+#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */
+#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */
+#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */
+#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */
+#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */
/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
/* Don't duplicate feature flags which are redundant with Intel! */
-#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */
-#define X86_FEATURE_MP (1*32+19) /* MP Capable. */
-#define X86_FEATURE_NX (1*32+20) /* Execute Disable */
-#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_FXSR_OPT (1*32+25) /* FXSAVE/FXRSTOR optimizations */
-#define X86_FEATURE_GBPAGES (1*32+26) /* "pdpe1gb" GB pages */
-#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */
-#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */
-#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */
-#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */
+#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */
+#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */
+#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */
+#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */
+#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */
+#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */
+#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */
/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
-#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */
-#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */
-#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */
+#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */
+#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */
+#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */
/* Other features, Linux-defined mapping, word 3 */
/* This range is used for feature bits which conflict or are synthesized */
-#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */
-#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */
-#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
-#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */
+#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
/* cpu types for specific tunings: */
-#define X86_FEATURE_K8 (3*32+ 4) /* "" Opteron, Athlon64 */
-#define X86_FEATURE_K7 (3*32+ 5) /* "" Athlon */
-#define X86_FEATURE_P3 (3*32+ 6) /* "" P3 */
-#define X86_FEATURE_P4 (3*32+ 7) /* "" P4 */
-#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */
-#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */
-#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
-#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */
-#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */
-#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */
-#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */
-#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */
-#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */
-#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
-#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
-#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
-#define X86_FEATURE_ALWAYS (3*32+21) /* "" Always-present feature */
-#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
-#define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
-#define X86_FEATURE_NONSTOP_TSC (3*32+24) /* TSC does not stop in C states */
-#define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
-#define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */
-#define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */
-#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */
-#define X86_FEATURE_EAGER_FPU (3*32+29) /* "eagerfpu" Non lazy FPU restore */
-#define X86_FEATURE_NONSTOP_TSC_S3 (3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
+#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
+#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
+#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
+#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */
+/* free, was #define X86_FEATURE_FXSAVE_LEAK ( 3*32+10) * "" FXSAVE leaks FOP/FIP/FOP */
+#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */
+#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */
+#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */
+#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
+#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
+#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
+/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */
+#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
+#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
+#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */
+/* free, was #define X86_FEATURE_CLFLUSH_MONITOR ( 3*32+25) * "" clflush reqd with monitor */
+#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
+#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
+#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
+#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
+#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
-#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
-#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */
-#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */
-#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */
-#define X86_FEATURE_DSCPL (4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
-#define X86_FEATURE_VMX (4*32+ 5) /* Hardware virtualization */
-#define X86_FEATURE_SMX (4*32+ 6) /* Safer mode */
-#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */
-#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */
-#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
-#define X86_FEATURE_CID (4*32+10) /* Context ID */
-#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */
-#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */
-#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */
-#define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */
-#define X86_FEATURE_PCID (4*32+17) /* Process Context Identifiers */
-#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */
-#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
-#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
-#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
-#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
-#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
-#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* Tsc deadline timer */
-#define X86_FEATURE_AES (4*32+25) /* AES instructions */
-#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
-#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
-#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
-#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
-#define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */
-#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
+#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */
+#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
+#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */
+#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */
+#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
+#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
+#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */
+#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */
+#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */
+#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */
+#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */
+#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */
+#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */
+#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */
+#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */
/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
-#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */
-#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* "rng_en" RNG enabled */
-#define X86_FEATURE_XCRYPT (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
-#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* "ace_en" on-CPU crypto enabled */
-#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */
-#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */
-#define X86_FEATURE_PHE (5*32+10) /* PadLock Hash Engine */
-#define X86_FEATURE_PHE_EN (5*32+11) /* PHE enabled */
-#define X86_FEATURE_PMM (5*32+12) /* PadLock Montgomery Multiplier */
-#define X86_FEATURE_PMM_EN (5*32+13) /* PMM enabled */
+#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */
+#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */
+#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
+#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */
+#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */
/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
-#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */
-#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */
-#define X86_FEATURE_SVM (6*32+ 2) /* Secure virtual machine */
-#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */
-#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */
-#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */
-#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */
-#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */
-#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
-#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
-#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */
-#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
-#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
-#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */
-#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */
-#define X86_FEATURE_TCE (6*32+17) /* translation cache extension */
-#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
-#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
-#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
-#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */
-#define X86_FEATURE_PERFCTR_NB (6*32+24) /* NB performance counter extensions */
-#define X86_FEATURE_PERFCTR_L2 (6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */
+#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */
+#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */
+#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */
+#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */
+#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */
+#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */
+#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */
+#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */
+#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
+#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
+#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
/*
* Auxiliary flags: Linux defined - For features scattered in various
* CPUID levels like 0x6, 0xA etc, word 7
*/
-#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
-#define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */
-#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */
-#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
-#define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */
-#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */
-#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */
-#define X86_FEATURE_HW_PSTATE (7*32+ 8) /* AMD HW-PState */
-#define X86_FEATURE_PROC_FEEDBACK (7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_IDA ( 7*32+ 0) /* Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */
+#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
+#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */
+#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
+#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
+#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
/* Virtualization flags: Linux defined, word 8 */
-#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
-#define X86_FEATURE_NPT (8*32+ 5) /* AMD Nested Page Table support */
-#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */
-#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
-#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
-#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
-#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
-#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */
-#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */
-#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */
-#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */
+#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
+#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
+#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_NPT ( 8*32+ 5) /* AMD Nested Page Table support */
+#define X86_FEATURE_LBRV ( 8*32+ 6) /* AMD LBR Virtualization support */
+#define X86_FEATURE_SVML ( 8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
+#define X86_FEATURE_NRIPS ( 8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR ( 8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN ( 8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID ( 8*32+11) /* AMD flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS ( 8*32+12) /* AMD Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
-#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
-#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */
-#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */
-#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */
-#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
-#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */
-#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */
-#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
-#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */
-#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */
-#define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */
-#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */
-#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */
-#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */
-#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */
-#define X86_FEATURE_CLFLUSHOPT (9*32+23) /* CLFLUSHOPT instruction */
-#define X86_FEATURE_AVX512PF (9*32+26) /* AVX-512 Prefetch */
-#define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */
-#define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */
+#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */
+#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
+#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
+#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
+#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
+#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
+#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
+#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
+#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
+
+/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
+#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */
+#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */
+#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
+#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
/*
* BUG word(s)
@@ -234,8 +239,11 @@
#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
-#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* AMD Erratum 383 */
-#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* AMD Erratum 400 */
+#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
+#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
+#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
+#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
+#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
@@ -245,6 +253,12 @@
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
+/*
+ * In order to save room, we index into this array by doing
+ * X86_BUG_<name> - NCAPINTS*32.
+ */
+extern const char * const x86_bug_flags[NBUGINTS*32];
+
#define test_cpu_cap(c, bit) \
test_bit(bit, (unsigned long *)((c)->x86_capability))
@@ -301,7 +315,6 @@ extern const char * const x86_power_flags[32];
#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX)
#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
-#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR)
#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR)
@@ -328,6 +341,7 @@ extern const char * const x86_power_flags[32];
#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
#define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT)
+#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES)
#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE)
#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
@@ -347,9 +361,6 @@ extern const char * const x86_power_flags[32];
#undef cpu_has_pae
#define cpu_has_pae ___BUG___
-#undef cpu_has_mp
-#define cpu_has_mp 1
-
#undef cpu_has_k6_mtrr
#define cpu_has_k6_mtrr 0
@@ -539,20 +550,20 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
#define static_cpu_has_safe(bit) boot_cpu_has(bit)
#endif
-#define cpu_has_bug(c, bit) cpu_has(c, (bit))
-#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit))
-#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit));
+#define cpu_has_bug(c, bit) cpu_has(c, (bit))
+#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit))
+#define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit))
-#define static_cpu_has_bug(bit) static_cpu_has((bit))
-#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit))
+#define static_cpu_has_bug(bit) static_cpu_has((bit))
+#define static_cpu_has_bug_safe(bit) static_cpu_has_safe((bit))
+#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit))
-#define MAX_CPU_FEATURES (NCAPINTS * 32)
-#define cpu_have_feature boot_cpu_has
+#define MAX_CPU_FEATURES (NCAPINTS * 32)
+#define cpu_have_feature boot_cpu_has
-#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X"
-#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
- boot_cpu_data.x86_model
+#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X"
+#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \
+ boot_cpu_data.x86_model
#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
-
#endif /* _ASM_X86_CPUFEATURE_H */
diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h
new file mode 100644
index 00000000000..f498411f250
--- /dev/null
+++ b/arch/x86/include/asm/crash.h
@@ -0,0 +1,9 @@
+#ifndef _ASM_X86_CRASH_H
+#define _ASM_X86_CRASH_H
+
+int crash_load_segments(struct kimage *image);
+int crash_copy_backup_region(struct kimage *image);
+int crash_setup_memmap_entries(struct kimage *image,
+ struct boot_params *params);
+
+#endif /* _ASM_X86_CRASH_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 1eb5f6433ad..044a2fd3c5f 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -104,6 +104,8 @@ extern void __init runtime_code_page_mkexec(void);
extern void __init efi_runtime_mkexec(void);
extern void __init efi_dump_pagetable(void);
extern void __init efi_apply_memmap_quirks(void);
+extern int __init efi_reuse_config(u64 tables, int nr_tables);
+extern void efi_delete_dummy_variable(void);
struct efi_setup_data {
u64 fw_vendor;
@@ -156,6 +158,33 @@ static inline efi_status_t efi_thunk_set_virtual_address_map(
return EFI_SUCCESS;
}
#endif /* CONFIG_EFI_MIXED */
+
+
+/* arch specific definitions used by the stub code */
+
+struct efi_config {
+ u64 image_handle;
+ u64 table;
+ u64 allocate_pool;
+ u64 allocate_pages;
+ u64 get_memory_map;
+ u64 free_pool;
+ u64 free_pages;
+ u64 locate_handle;
+ u64 handle_protocol;
+ u64 exit_boot_services;
+ u64 text_output;
+ efi_status_t (*call)(unsigned long, ...);
+ bool is64;
+} __packed;
+
+extern struct efi_config *efi_early;
+
+#define efi_call_early(f, ...) \
+ efi_early->call(efi_early->f, __VA_ARGS__);
+
+extern bool efi_reboot_required(void);
+
#else
/*
* IF EFI is not configured, have the EFI calls return -ENOSYS.
@@ -168,6 +197,10 @@ static inline efi_status_t efi_thunk_set_virtual_address_map(
#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS)
#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS)
static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {}
+static inline bool efi_reboot_required(void)
+{
+ return false;
+}
#endif /* CONFIG_EFI */
#endif /* _ASM_X86_EFI_H */
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 115e3689cd5..412ececa00b 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -293,7 +293,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
is pending. Clear the x87 state here by setting it to fixed
values. "m" is a random variable that should be in L1 */
- if (unlikely(static_cpu_has_safe(X86_FEATURE_FXSAVE_LEAK))) {
+ if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
asm volatile(
"fnclex\n\t"
"emms\n\t"
@@ -508,9 +508,12 @@ static inline void user_fpu_begin(void)
static inline void __save_fpu(struct task_struct *tsk)
{
- if (use_xsave())
- xsave_state(&tsk->thread.fpu.state->xsave, -1);
- else
+ if (use_xsave()) {
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ xsave_state_booting(&tsk->thread.fpu.state->xsave, -1);
+ else
+ xsave_state(&tsk->thread.fpu.state->xsave, -1);
+ } else
fpu_fxsave(&tsk->thread.fpu);
}
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 0525a8bdf65..e1f7fecaa7d 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -68,6 +68,8 @@ struct dyn_arch_ftrace {
int ftrace_int3_handler(struct pt_regs *regs);
+#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
+
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* __ASSEMBLY__ */
#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index cb6cfcd034c..a80cbb88ea9 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector);
extern void init_ISA_irqs(void);
#ifdef CONFIG_X86_LOCAL_APIC
-void arch_trigger_all_cpu_backtrace(void);
+void arch_trigger_all_cpu_backtrace(bool);
#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
#endif
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index bba3cf88e62..0a8b519226b 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -129,7 +129,7 @@ static inline notrace unsigned long arch_local_irq_save(void)
#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */
-#define INTERRUPT_RETURN iretq
+#define INTERRUPT_RETURN jmp native_iret
#define USERGS_SYSRET64 \
swapgs; \
sysretq;
diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h
new file mode 100644
index 00000000000..d1b5d194e31
--- /dev/null
+++ b/arch/x86/include/asm/kexec-bzimage64.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_KEXEC_BZIMAGE64_H
+#define _ASM_KEXEC_BZIMAGE64_H
+
+extern struct kexec_file_ops kexec_bzImage64_ops;
+
+#endif /* _ASM_KEXE_BZIMAGE64_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 17483a492f1..d2434c1cad0 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -23,6 +23,9 @@
#include <asm/page.h>
#include <asm/ptrace.h>
+#include <asm/bootparam.h>
+
+struct kimage;
/*
* KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
@@ -61,6 +64,10 @@
# define KEXEC_ARCH KEXEC_ARCH_X86_64
#endif
+/* Memory to backup during crash kdump */
+#define KEXEC_BACKUP_SRC_START (0UL)
+#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */
+
/*
* CPU does not save ss and sp on stack if execution is already
* running in kernel mode at the time of NMI occurrence. This code
@@ -160,6 +167,44 @@ struct kimage_arch {
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ /* Details of backup region */
+ unsigned long backup_src_start;
+ unsigned long backup_src_sz;
+
+ /* Physical address of backup segment */
+ unsigned long backup_load_addr;
+
+ /* Core ELF header buffer */
+ void *elf_headers;
+ unsigned long elf_headers_sz;
+ unsigned long elf_load_addr;
+};
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_64
+/*
+ * Number of elements and order of elements in this structure should match
+ * with the ones in arch/x86/purgatory/entry64.S. If you make a change here
+ * make an appropriate change in purgatory too.
+ */
+struct kexec_entry64_regs {
+ uint64_t rax;
+ uint64_t rcx;
+ uint64_t rdx;
+ uint64_t rbx;
+ uint64_t rsp;
+ uint64_t rbp;
+ uint64_t rsi;
+ uint64_t rdi;
+ uint64_t r8;
+ uint64_t r9;
+ uint64_t r10;
+ uint64_t r11;
+ uint64_t r12;
+ uint64_t r13;
+ uint64_t r14;
+ uint64_t r15;
+ uint64_t rip;
};
#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index a04fe4eb237..eb181178fe0 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -37,6 +37,7 @@ struct x86_instruction_info {
u8 modrm_reg; /* index of register used */
u8 modrm_rm; /* rm part of modrm */
u64 src_val; /* value of source operand */
+ u64 dst_val; /* value of destination operand */
u8 src_bytes; /* size of source operand */
u8 dst_bytes; /* size of destination operand */
u8 ad_bytes; /* size of src/dst address */
@@ -194,6 +195,7 @@ struct x86_emulate_ops {
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
void (*halt)(struct x86_emulate_ctxt *ctxt);
void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
@@ -231,7 +233,7 @@ struct operand {
union {
unsigned long val;
u64 val64;
- char valptr[sizeof(unsigned long) + 2];
+ char valptr[sizeof(sse128_t)];
sse128_t vec_val;
u64 mm_val;
void *data;
@@ -240,8 +242,8 @@ struct operand {
struct fetch_cache {
u8 data[15];
- unsigned long start;
- unsigned long end;
+ u8 *ptr;
+ u8 *end;
};
struct read_cache {
@@ -286,30 +288,36 @@ struct x86_emulate_ctxt {
u8 opcode_len;
u8 b;
u8 intercept;
- u8 lock_prefix;
- u8 rep_prefix;
u8 op_bytes;
u8 ad_bytes;
- u8 rex_prefix;
struct operand src;
struct operand src2;
struct operand dst;
- bool has_seg_override;
- u8 seg_override;
- u64 d;
int (*execute)(struct x86_emulate_ctxt *ctxt);
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
+ /*
+ * The following six fields are cleared together,
+ * the rest are initialized unconditionally in x86_decode_insn
+ * or elsewhere
+ */
+ bool rip_relative;
+ u8 rex_prefix;
+ u8 lock_prefix;
+ u8 rep_prefix;
+ /* bitmaps of registers in _regs[] that can be read */
+ u32 regs_valid;
+ /* bitmaps of registers in _regs[] that have been written */
+ u32 regs_dirty;
/* modrm */
u8 modrm;
u8 modrm_mod;
u8 modrm_reg;
u8 modrm_rm;
u8 modrm_seg;
- bool rip_relative;
+ u8 seg_override;
+ u64 d;
unsigned long _eip;
struct operand memop;
- u32 regs_valid; /* bitmaps of registers in _regs[] that can be read */
- u32 regs_dirty; /* bitmaps of registers in _regs[] that have been written */
/* Fields above regs are cleared together. */
unsigned long _regs[NR_VCPU_REGS];
struct operand *memopp;
@@ -407,6 +415,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
#define EMULATION_OK 0
#define EMULATION_RESTART 1
#define EMULATION_INTERCEPTED 2
+void init_decode_cache(struct x86_emulate_ctxt *ctxt);
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
u16 tss_selector, int idt_index, int reason,
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 49314155b66..572460175ba 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -95,7 +95,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 80
#define KVM_NR_FIXED_MTRR_REGION 88
-#define KVM_NR_VAR_MTRR 8
+#define KVM_NR_VAR_MTRR 10
#define ASYNC_PF_PER_VCPU 64
@@ -152,14 +152,16 @@ enum {
#define DR6_BD (1 << 13)
#define DR6_BS (1 << 14)
-#define DR6_FIXED_1 0xffff0ff0
-#define DR6_VOLATILE 0x0000e00f
+#define DR6_RTM (1 << 16)
+#define DR6_FIXED_1 0xfffe0ff0
+#define DR6_INIT 0xffff0ff0
+#define DR6_VOLATILE 0x0001e00f
#define DR7_BP_EN_MASK 0x000000ff
#define DR7_GE (1 << 9)
#define DR7_GD (1 << 13)
#define DR7_FIXED_1 0x00000400
-#define DR7_VOLATILE 0xffff23ff
+#define DR7_VOLATILE 0xffff2bff
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
@@ -448,7 +450,7 @@ struct kvm_vcpu_arch {
u64 tsc_offset_adjustment;
u64 this_tsc_nsec;
u64 this_tsc_write;
- u8 this_tsc_generation;
+ u64 this_tsc_generation;
bool tsc_catchup;
bool tsc_always_catchup;
s8 virtual_tsc_shift;
@@ -461,7 +463,7 @@ struct kvm_vcpu_arch {
bool nmi_injected; /* Trying to inject an NMI this entry */
struct mtrr_state_type mtrr_state;
- u32 pat;
+ u64 pat;
unsigned switch_db_regs;
unsigned long db[KVM_NR_DB_REGS];
@@ -591,7 +593,7 @@ struct kvm_arch {
u64 cur_tsc_nsec;
u64 cur_tsc_write;
u64 cur_tsc_offset;
- u8 cur_tsc_generation;
+ u64 cur_tsc_generation;
int nr_vcpus_matched_tsc;
spinlock_t pvclock_gtod_sync_lock;
@@ -717,7 +719,7 @@ struct kvm_x86_ops {
int (*handle_exit)(struct kvm_vcpu *vcpu);
void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
- u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+ u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
unsigned char *hypercall_addr);
void (*set_irq)(struct kvm_vcpu *vcpu);
@@ -1070,6 +1072,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc);
int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index a55c7efcc4e..0f555cc3198 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -13,7 +13,7 @@
#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */
#endif
-#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG)
+#if defined(CONFIG_X86_32)
/*
* This lock provides nmi access to the CMOS/RTC registers. It has some
* special properties. It is owned by a CPU and stores the index register
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index be12c534fd5..166af2a8e86 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -3,6 +3,10 @@
#include <asm/desc.h>
#include <linux/atomic.h>
+#include <linux/mm_types.h>
+
+#include <trace/events/tlb.h>
+
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
@@ -44,6 +48,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Re-load page tables */
load_cr3(next->pgd);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
/* Stop flush ipis for the previous mm */
cpumask_clear_cpu(cpu, mm_cpumask(prev));
@@ -71,6 +76,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* to make sure to use no freed page tables.
*/
load_cr3(next->pgd);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
load_LDT_nolock(&next->context);
}
}
diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h
index 0208c3c2cbc..85e6cda45a0 100644
--- a/arch/x86/include/asm/mutex_32.h
+++ b/arch/x86/include/asm/mutex_32.h
@@ -100,23 +100,11 @@ do { \
static inline int __mutex_fastpath_trylock(atomic_t *count,
int (*fail_fn)(atomic_t *))
{
- /*
- * We have two variants here. The cmpxchg based one is the best one
- * because it never induce a false contention state. It is included
- * here because architectures using the inc/dec algorithms over the
- * xchg ones are much more likely to support cmpxchg natively.
- *
- * If not we fall back to the spinlock based variant - that is
- * just as efficient (and simpler) as a 'destructive' probing of
- * the mutex state would be.
- */
-#ifdef __HAVE_ARCH_CMPXCHG
+ /* cmpxchg because it never induces a false contention state. */
if (likely(atomic_cmpxchg(count, 1, 0) == 1))
return 1;
+
return 0;
-#else
- return fail_fn(count);
-#endif
}
#endif /* _ASM_X86_MUTEX_32_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 1da25a5f96f..a1410db38a1 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -43,7 +43,7 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
if (!current_set_polling_and_test()) {
- if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
+ if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
mb();
clflush((void *)&current_thread_info()->flags);
mb();
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 775873d3be5..802dde30c92 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -70,7 +70,6 @@ extern bool __virt_addr_valid(unsigned long kaddr);
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
-#define __HAVE_ARCH_GATE_AREA 1
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 0f1ddee6a0c..f408caf7343 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -39,4 +39,6 @@ void copy_page(void *to, void *from);
#endif /* !__ASSEMBLY__ */
+#define __HAVE_ARCH_GATE_AREA 1
+
#endif /* _ASM_X86_PAGE_64_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 851bcdc5db0..fd472181a1d 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -52,10 +52,9 @@
* Compared to the generic __my_cpu_offset version, the following
* saves one instruction and avoids clobbering a temp register.
*/
-#define raw_cpu_ptr(ptr) \
+#define arch_raw_cpu_ptr(ptr) \
({ \
unsigned long tcp_ptr__; \
- __verify_pcpu_ptr(ptr); \
asm volatile("add " __percpu_arg(1) ", %0" \
: "=r" (tcp_ptr__) \
: "m" (this_cpu_off), "0" (ptr)); \
diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h
new file mode 100644
index 00000000000..0a4e140315b
--- /dev/null
+++ b/arch/x86/include/asm/platform_sst_audio.h
@@ -0,0 +1,78 @@
+/*
+ * platform_sst_audio.h: sst audio platform data header file
+ *
+ * Copyright (C) 2012-14 Intel Corporation
+ * Author: Jeeja KP <jeeja.kp@intel.com>
+ * Omair Mohammed Abdullah <omair.m.abdullah@intel.com>
+ * Vinod Koul ,vinod.koul@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _PLATFORM_SST_AUDIO_H_
+#define _PLATFORM_SST_AUDIO_H_
+
+#include <linux/sfi.h>
+
+enum sst_audio_task_id_mrfld {
+ SST_TASK_ID_NONE = 0,
+ SST_TASK_ID_SBA = 1,
+ SST_TASK_ID_MEDIA = 3,
+ SST_TASK_ID_MAX = SST_TASK_ID_MEDIA,
+};
+
+/* Device IDs for Merrifield are Pipe IDs,
+ * ref: DSP spec v0.75 */
+enum sst_audio_device_id_mrfld {
+ /* Output pipeline IDs */
+ PIPE_ID_OUT_START = 0x0,
+ PIPE_CODEC_OUT0 = 0x2,
+ PIPE_CODEC_OUT1 = 0x3,
+ PIPE_SPROT_LOOP_OUT = 0x4,
+ PIPE_MEDIA_LOOP1_OUT = 0x5,
+ PIPE_MEDIA_LOOP2_OUT = 0x6,
+ PIPE_VOIP_OUT = 0xC,
+ PIPE_PCM0_OUT = 0xD,
+ PIPE_PCM1_OUT = 0xE,
+ PIPE_PCM2_OUT = 0xF,
+ PIPE_MEDIA0_OUT = 0x12,
+ PIPE_MEDIA1_OUT = 0x13,
+/* Input Pipeline IDs */
+ PIPE_ID_IN_START = 0x80,
+ PIPE_CODEC_IN0 = 0x82,
+ PIPE_CODEC_IN1 = 0x83,
+ PIPE_SPROT_LOOP_IN = 0x84,
+ PIPE_MEDIA_LOOP1_IN = 0x85,
+ PIPE_MEDIA_LOOP2_IN = 0x86,
+ PIPE_VOIP_IN = 0x8C,
+ PIPE_PCM0_IN = 0x8D,
+ PIPE_PCM1_IN = 0x8E,
+ PIPE_MEDIA0_IN = 0x8F,
+ PIPE_MEDIA1_IN = 0x90,
+ PIPE_MEDIA2_IN = 0x91,
+ PIPE_RSVD = 0xFF,
+};
+
+/* The stream map for each platform consists of an array of the below
+ * stream map structure.
+ */
+struct sst_dev_stream_map {
+ u8 dev_num; /* device id */
+ u8 subdev_num; /* substream */
+ u8 direction;
+ u8 device_id; /* fw id */
+ u8 task_id; /* fw task */
+ u8 status;
+};
+
+struct sst_platform_data {
+ /* Intel software platform id*/
+ struct sst_dev_stream_map *pdev_strm_map;
+ unsigned int strm_map_size;
+};
+
+int add_sst_platform_device(void);
+#endif
+
diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h
new file mode 100644
index 00000000000..fc7a17c05d3
--- /dev/null
+++ b/arch/x86/include/asm/pmc_atom.h
@@ -0,0 +1,107 @@
+/*
+ * Intel Atom SOC Power Management Controller Header File
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef PMC_ATOM_H
+#define PMC_ATOM_H
+
+/* ValleyView Power Control Unit PCI Device ID */
+#define PCI_DEVICE_ID_VLV_PMC 0x0F1C
+
+/* PMC Memory mapped IO registers */
+#define PMC_BASE_ADDR_OFFSET 0x44
+#define PMC_BASE_ADDR_MASK 0xFFFFFE00
+#define PMC_MMIO_REG_LEN 0x100
+#define PMC_REG_BIT_WIDTH 32
+
+/* BIOS uses FUNC_DIS to disable specific function */
+#define PMC_FUNC_DIS 0x34
+#define PMC_FUNC_DIS_2 0x38
+
+/* S0ix wake event control */
+#define PMC_S0IX_WAKE_EN 0x3C
+
+#define BIT_LPC_CLOCK_RUN BIT(4)
+#define BIT_SHARED_IRQ_GPSC BIT(5)
+#define BIT_ORED_DEDICATED_IRQ_GPSS BIT(18)
+#define BIT_ORED_DEDICATED_IRQ_GPSC BIT(19)
+#define BIT_SHARED_IRQ_GPSS BIT(20)
+
+#define PMC_WAKE_EN_SETTING ~(BIT_LPC_CLOCK_RUN | \
+ BIT_SHARED_IRQ_GPSC | \
+ BIT_ORED_DEDICATED_IRQ_GPSS | \
+ BIT_ORED_DEDICATED_IRQ_GPSC | \
+ BIT_SHARED_IRQ_GPSS)
+
+/* The timers acumulate time spent in sleep state */
+#define PMC_S0IR_TMR 0x80
+#define PMC_S0I1_TMR 0x84
+#define PMC_S0I2_TMR 0x88
+#define PMC_S0I3_TMR 0x8C
+#define PMC_S0_TMR 0x90
+/* Sleep state counter is in units of of 32us */
+#define PMC_TMR_SHIFT 5
+
+/* These registers reflect D3 status of functions */
+#define PMC_D3_STS_0 0xA0
+
+#define BIT_LPSS1_F0_DMA BIT(0)
+#define BIT_LPSS1_F1_PWM1 BIT(1)
+#define BIT_LPSS1_F2_PWM2 BIT(2)
+#define BIT_LPSS1_F3_HSUART1 BIT(3)
+#define BIT_LPSS1_F4_HSUART2 BIT(4)
+#define BIT_LPSS1_F5_SPI BIT(5)
+#define BIT_LPSS1_F6_XXX BIT(6)
+#define BIT_LPSS1_F7_XXX BIT(7)
+#define BIT_SCC_EMMC BIT(8)
+#define BIT_SCC_SDIO BIT(9)
+#define BIT_SCC_SDCARD BIT(10)
+#define BIT_SCC_MIPI BIT(11)
+#define BIT_HDA BIT(12)
+#define BIT_LPE BIT(13)
+#define BIT_OTG BIT(14)
+#define BIT_USH BIT(15)
+#define BIT_GBE BIT(16)
+#define BIT_SATA BIT(17)
+#define BIT_USB_EHCI BIT(18)
+#define BIT_SEC BIT(19)
+#define BIT_PCIE_PORT0 BIT(20)
+#define BIT_PCIE_PORT1 BIT(21)
+#define BIT_PCIE_PORT2 BIT(22)
+#define BIT_PCIE_PORT3 BIT(23)
+#define BIT_LPSS2_F0_DMA BIT(24)
+#define BIT_LPSS2_F1_I2C1 BIT(25)
+#define BIT_LPSS2_F2_I2C2 BIT(26)
+#define BIT_LPSS2_F3_I2C3 BIT(27)
+#define BIT_LPSS2_F4_I2C4 BIT(28)
+#define BIT_LPSS2_F5_I2C5 BIT(29)
+#define BIT_LPSS2_F6_I2C6 BIT(30)
+#define BIT_LPSS2_F7_I2C7 BIT(31)
+
+#define PMC_D3_STS_1 0xA4
+#define BIT_SMB BIT(0)
+#define BIT_OTG_SS_PHY BIT(1)
+#define BIT_USH_SS_PHY BIT(2)
+#define BIT_DFX BIT(3)
+
+/* PMC I/O Registers */
+#define ACPI_BASE_ADDR_OFFSET 0x40
+#define ACPI_BASE_ADDR_MASK 0xFFFFFE00
+#define ACPI_MMIO_REG_LEN 0x100
+
+#define PM1_CNT 0x4
+#define SLEEP_TYPE_MASK 0xFFFFECFF
+#define SLEEP_TYPE_S5 0x1C00
+#define SLEEP_ENABLE 0x2000
+#endif /* PMC_ATOM_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a4ea02351f4..eb71ec79473 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -72,7 +72,6 @@ extern u16 __read_mostly tlb_lld_4k[NR_INFO];
extern u16 __read_mostly tlb_lld_2m[NR_INFO];
extern u16 __read_mostly tlb_lld_4m[NR_INFO];
extern u16 __read_mostly tlb_lld_1g[NR_INFO];
-extern s8 __read_mostly tlb_flushall_shift;
/*
* CPU type and hardware bug flags. Kept separately for each CPU.
@@ -386,8 +385,8 @@ struct bndcsr_struct {
struct xsave_hdr_struct {
u64 xstate_bv;
- u64 reserved1[2];
- u64 reserved2[5];
+ u64 xcomp_bv;
+ u64 reserved[6];
} __attribute__((packed));
struct xsave_struct {
@@ -696,6 +695,8 @@ static inline void cpu_relax(void)
rep_nop();
}
+#define cpu_relax_lowlatency() cpu_relax()
+
/* Stop speculative execution and prefetching of modified code. */
static inline void sync_core(void)
{
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 14fd6fd75a1..6205f0c434d 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -231,6 +231,22 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
#define ARCH_HAS_USER_SINGLE_STEP_INFO
+/*
+ * When hitting ptrace_stop(), we cannot return using SYSRET because
+ * that does not restore the full CPU state, only a minimal set. The
+ * ptracer can change arbitrary register values, which is usually okay
+ * because the usual ptrace stops run off the signal delivery path which
+ * forces IRET; however, ptrace_event() stops happen in arbitrary places
+ * in the kernel and don't force IRET path.
+ *
+ * So force IRET path after a ptrace stop.
+ */
+#define arch_ptrace_stop_needed(code, info) \
+({ \
+ set_thread_flag(TIF_NOTIFY_RESUME); \
+ false; \
+})
+
struct user_desc;
extern int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info);
diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
index 70f46f07f94..ae0e241e228 100644
--- a/arch/x86/include/asm/qrwlock.h
+++ b/arch/x86/include/asm/qrwlock.h
@@ -3,7 +3,7 @@
#include <asm-generic/qrwlock_types.h>
-#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
+#ifndef CONFIG_X86_PPRO_FENCE
#define queue_write_unlock queue_write_unlock
static inline void queue_write_unlock(struct qrwlock *lock)
{
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
deleted file mode 100644
index 4240878b9d7..00000000000
--- a/arch/x86/include/asm/scatterlist.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _ASM_X86_SCATTERLIST_H
-#define _ASM_X86_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-
-#define ARCH_HAS_SG_CHAIN
-
-#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 0b46ef261c7..2d60a7813df 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -73,6 +73,7 @@
#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \
UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \
UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD)
+/* assuming UV3 is the same */
#define BAU_MISC_CONTROL_MULT_MASK 3
@@ -93,6 +94,8 @@
#define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT
#define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
#define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD
+#define PREFETCH_HINT_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT
+#define SB_STATUS_SHFT UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
#define write_gmmr uv_write_global_mmr64
#define write_lmmr uv_write_local_mmr
#define read_lmmr uv_read_local_mmr
@@ -322,8 +325,9 @@ struct uv1_bau_msg_header {
/*
* UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
* see figure 9-2 of harp_sys.pdf
+ * assuming UV3 is the same
*/
-struct uv2_bau_msg_header {
+struct uv2_3_bau_msg_header {
unsigned int base_dest_nasid:15; /* nasid of the first bit */
/* bits 14:0 */ /* in uvhub map */
unsigned int dest_subnodeid:5; /* must be 0x10, for the LB */
@@ -395,7 +399,7 @@ struct bau_desc {
*/
union bau_msg_header {
struct uv1_bau_msg_header uv1_hdr;
- struct uv2_bau_msg_header uv2_hdr;
+ struct uv2_3_bau_msg_header uv2_3_hdr;
} header;
struct bau_msg_payload payload;
@@ -631,11 +635,6 @@ struct bau_control {
struct hub_and_pnode *thp;
};
-static inline unsigned long read_mmr_uv2_status(void)
-{
- return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2);
-}
-
static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
{
write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
@@ -760,7 +759,11 @@ static inline int atomic_read_short(const struct atomic_short *v)
*/
static inline int atom_asr(short i, struct atomic_short *v)
{
- return i + xadd(&v->counter, i);
+ short __i = i;
+ asm volatile(LOCK_PREFIX "xaddw %0, %1"
+ : "+r" (i), "+m" (v->counter)
+ : : "memory");
+ return i + __i;
}
/*
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 30be253dd28..8021bd28c0f 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -18,15 +18,15 @@ struct vdso_image {
unsigned long alt, alt_len;
- unsigned long sym_end_mapping; /* Total size of the mapping */
-
- unsigned long sym_vvar_page;
- unsigned long sym_hpet_page;
- unsigned long sym_VDSO32_NOTE_MASK;
- unsigned long sym___kernel_sigreturn;
- unsigned long sym___kernel_rt_sigreturn;
- unsigned long sym___kernel_vsyscall;
- unsigned long sym_VDSO32_SYSENTER_RETURN;
+ long sym_vvar_start; /* Negative offset to the vvar area */
+
+ long sym_vvar_page;
+ long sym_hpet_page;
+ long sym_VDSO32_NOTE_MASK;
+ long sym___kernel_sigreturn;
+ long sym___kernel_rt_sigreturn;
+ long sym___kernel_vsyscall;
+ long sym_VDSO32_SYSENTER_RETURN;
};
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
index 44282fbf7bf..c4b9dc2f67c 100644
--- a/arch/x86/include/asm/vga.h
+++ b/arch/x86/include/asm/vga.h
@@ -17,10 +17,4 @@
#define vga_readb(x) (*(x))
#define vga_writeb(x, y) (*(y) = (x))
-#ifdef CONFIG_FB_EFI
-#define __ARCH_HAS_VGA_DEFAULT_DEVICE
-extern struct pci_dev *vga_default_device(void);
-extern void vga_set_default_device(struct pci_dev *pdev);
-#endif
-
#endif /* _ASM_X86_VGA_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 7004d21e621..bcbfade26d8 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -51,6 +51,9 @@
#define CPU_BASED_MONITOR_EXITING 0x20000000
#define CPU_BASED_PAUSE_EXITING 0x40000000
#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
+
+#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172
+
/*
* Definitions of Secondary Processor-Based VM-Execution Controls.
*/
@@ -76,7 +79,7 @@
#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016
-#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
+#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004
#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
@@ -89,7 +92,7 @@
#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff
-#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004
#define VM_ENTRY_IA32E_MODE 0x00000200
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index d949ef28c48..7e7a79ada65 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -52,24 +52,170 @@ extern void xsave_init(void);
extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
extern int init_fpu(struct task_struct *child);
-static inline int fpu_xrstor_checking(struct xsave_struct *fx)
+/* These macros all use (%edi)/(%rdi) as the single memory argument. */
+#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
+#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
+#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
+#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+
+#define xstate_fault ".section .fixup,\"ax\"\n" \
+ "3: movl $-1,%[err]\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : [err] "=r" (err)
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask)
{
- int err;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err = 0;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ asm volatile("1:"XSAVES"\n\t"
+ "2:\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+ else
+ asm volatile("1:"XSAVE"\n\t"
+ "2:\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+
+ asm volatile(xstate_fault
+ : "0" (0)
+ : "memory");
+
+ return err;
+}
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err = 0;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
- asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
- "2:\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : [err] "=r" (err)
- : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ asm volatile("1:"XRSTORS"\n\t"
+ "2:\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+ else
+ asm volatile("1:"XRSTOR"\n\t"
+ "2:\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+
+ asm volatile(xstate_fault
+ : "0" (0)
+ : "memory");
+
+ return err;
+}
+
+/*
+ * Save processor xstate to xsave area.
+ */
+static inline int xsave_state(struct xsave_struct *fx, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err = 0;
+
+ /*
+ * If xsaves is enabled, xsaves replaces xsaveopt because
+ * it supports compact format and supervisor states in addition to
+ * modified optimization in xsaveopt.
+ *
+ * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave
+ * because xsaveopt supports modified optimization which is not
+ * supported by xsave.
+ *
+ * If none of xsaves and xsaveopt is enabled, use xsave.
+ */
+ alternative_input_2(
+ "1:"XSAVE,
+ "1:"XSAVEOPT,
+ X86_FEATURE_XSAVEOPT,
+ "1:"XSAVES,
+ X86_FEATURE_XSAVES,
+ [fx] "D" (fx), "a" (lmask), "d" (hmask) :
+ "memory");
+ asm volatile("2:\n\t"
+ xstate_fault
+ : "0" (0)
: "memory");
return err;
}
+/*
+ * Restore processor xstate from xsave area.
+ */
+static inline int xrstor_state(struct xsave_struct *fx, u64 mask)
+{
+ int err = 0;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ /*
+ * Use xrstors to restore context if it is enabled. xrstors supports
+ * compacted format of xsave area which is not supported by xrstor.
+ */
+ alternative_input(
+ "1: " XRSTOR,
+ "1: " XRSTORS,
+ X86_FEATURE_XSAVES,
+ "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+
+ asm volatile("2:\n"
+ xstate_fault
+ : "0" (0)
+ : "memory");
+
+ return err;
+}
+
+/*
+ * Save xstate context for old process during context switch.
+ */
+static inline void fpu_xsave(struct fpu *fpu)
+{
+ xsave_state(&fpu->state->xsave, -1);
+}
+
+/*
+ * Restore xstate context for new process during context switch.
+ */
+static inline int fpu_xrstor_checking(struct xsave_struct *fx)
+{
+ return xrstor_state(fx, -1);
+}
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for
+ * backward compatibility for old applications which don't understand
+ * compacted format of xsave area.
+ */
static inline int xsave_user(struct xsave_struct __user *buf)
{
int err;
@@ -83,69 +229,34 @@ static inline int xsave_user(struct xsave_struct __user *buf)
return -EFAULT;
__asm__ __volatile__(ASM_STAC "\n"
- "1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
+ "1:"XSAVE"\n"
"2: " ASM_CLAC "\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b,3b)
- : [err] "=r" (err)
+ xstate_fault
: "D" (buf), "a" (-1), "d" (-1), "0" (0)
: "memory");
return err;
}
+/*
+ * Restore xstate from user space xsave area.
+ */
static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
{
- int err;
+ int err = 0;
struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
u32 lmask = mask;
u32 hmask = mask >> 32;
__asm__ __volatile__(ASM_STAC "\n"
- "1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n"
+ "1:"XRSTOR"\n"
"2: " ASM_CLAC "\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b,3b)
- : [err] "=r" (err)
+ xstate_fault
: "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
: "memory"); /* memory required? */
return err;
}
-static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
-{
- u32 lmask = mask;
- u32 hmask = mask >> 32;
-
- asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
- : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
- : "memory");
-}
-
-static inline void xsave_state(struct xsave_struct *fx, u64 mask)
-{
- u32 lmask = mask;
- u32 hmask = mask >> 32;
+void *get_xsave_addr(struct xsave_struct *xsave, int xstate);
+void setup_xstate_comp(void);
- asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
- : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
- : "memory");
-}
-
-static inline void fpu_xsave(struct fpu *fpu)
-{
- /* This, however, we can work around by forcing the compiler to select
- an addressing mode that doesn't require extended registers. */
- alternative_input(
- ".byte " REX_PREFIX "0x0f,0xae,0x27",
- ".byte " REX_PREFIX "0x0f,0xae,0x37",
- X86_FEATURE_XSAVEOPT,
- [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
- "memory");
-}
#endif
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index 09409c44f9a..3dec769cadf 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -22,6 +22,7 @@ header-y += ipcbuf.h
header-y += ist.h
header-y += kvm.h
header-y += kvm_para.h
+header-y += kvm_perf.h
header-y += ldt.h
header-y += mce.h
header-y += mman.h
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d3a87780c70..d7dcef58aef 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -23,7 +23,10 @@
#define GP_VECTOR 13
#define PF_VECTOR 14
#define MF_VECTOR 16
+#define AC_VECTOR 17
#define MC_VECTOR 18
+#define XM_VECTOR 19
+#define VE_VECTOR 20
/* Select x86 specific features in <linux/kvm.h> */
#define __KVM_HAVE_PIT
diff --git a/arch/x86/include/uapi/asm/kvm_perf.h b/arch/x86/include/uapi/asm/kvm_perf.h
new file mode 100644
index 00000000000..3bb964f88aa
--- /dev/null
+++ b/arch/x86/include/uapi/asm/kvm_perf.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_KVM_PERF_H
+#define _ASM_X86_KVM_PERF_H
+
+#include <asm/svm.h>
+#include <asm/vmx.h>
+#include <asm/kvm.h>
+
+#define DECODE_STR_LEN 20
+
+#define VCPU_ID "vcpu_id"
+
+#define KVM_ENTRY_TRACE "kvm:kvm_entry"
+#define KVM_EXIT_TRACE "kvm:kvm_exit"
+#define KVM_EXIT_REASON "exit_reason"
+
+#endif /* _ASM_X86_KVM_PERF_H */
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index fcf2b3ae1bf..eac9e92fe18 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -297,6 +297,8 @@
#define MSR_IA32_TSC_ADJUST 0x0000003b
#define MSR_IA32_BNDCFGS 0x00000d90
+#define MSR_IA32_XSS 0x00000da0
+
#define FEATURE_CONTROL_LOCKED (1<<0)
#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
@@ -558,6 +560,7 @@
/* VMX_BASIC bits and bitmasks */
#define VMX_BASIC_VMCS_SIZE_SHIFT 32
+#define VMX_BASIC_TRUE_CTLS (1ULL << 55)
#define VMX_BASIC_64 0x0001000000000000LLU
#define VMX_BASIC_MEM_TYPE_SHIFT 50
#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 047f9ff2e36..b5ea75c4a4b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
+obj-$(CONFIG_PMC_ATOM) += pmc_atom.o
###
# 64 bit specific files
@@ -117,4 +118,5 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
+ obj-$(CONFIG_KEXEC) += kexec-bzimage64.o
endif
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 163b2258147..3242e591fa8 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,5 +1,6 @@
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_APEI) += apei.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
new file mode 100644
index 00000000000..c280df6b2aa
--- /dev/null
+++ b/arch/x86/kernel/acpi/apei.c
@@ -0,0 +1,62 @@
+/*
+ * Arch-specific APEI-related functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <acpi/apei.h>
+
+#include <asm/mce.h>
+#include <asm/tlbflush.h>
+
+int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
+{
+#ifdef CONFIG_X86_MCE
+ int i;
+ struct acpi_hest_ia_corrected *cmc;
+ struct acpi_hest_ia_error_bank *mc_bank;
+
+ if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
+ return 0;
+
+ cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
+ if (!cmc->enabled)
+ return 0;
+
+ /*
+ * We expect HEST to provide a list of MC banks that report errors
+ * in firmware first mode. Otherwise, return non-zero value to
+ * indicate that we are done parsing HEST.
+ */
+ if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) ||
+ !cmc->num_hardware_banks)
+ return 1;
+
+ pr_info("HEST: Enabling Firmware First mode for corrected errors.\n");
+
+ mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
+ for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+ mce_disable_bank(mc_bank->bank_number);
+#endif
+ return 1;
+}
+
+void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
+{
+#ifdef CONFIG_X86_MCE
+ apei_mce_report_mem_error(sev, mem_err);
+#endif
+}
+
+void arch_apei_flush_tlb_one(unsigned long addr)
+{
+ __flush_tlb_one(addr);
+}
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8c28023924b..b436fc735aa 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -76,10 +76,6 @@ int acpi_fix_pin2_polarity __initdata;
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#endif
-#ifndef __HAVE_ARCH_CMPXCHG
-#warning ACPI uses CMPXCHG, i486 and later hardware
-#endif
-
/* --------------------------------------------------------------------------
Boot-time Configuration
-------------------------------------------------------------------------- */
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index c3fcb5de508..6a1e71bde32 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -33,31 +33,41 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
-void arch_trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(bool include_self)
{
int i;
+ int cpu = get_cpu();
- if (test_and_set_bit(0, &backtrace_flag))
+ if (test_and_set_bit(0, &backtrace_flag)) {
/*
* If there is already a trigger_all_cpu_backtrace() in progress
* (backtrace_flag == 1), don't output double cpu dump infos.
*/
+ put_cpu();
return;
+ }
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+ if (!include_self)
+ cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
- printk(KERN_INFO "sending NMI to all CPUs:\n");
- apic->send_IPI_all(NMI_VECTOR);
+ if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+ pr_info("sending NMI to %s CPUs:\n",
+ (include_self ? "all" : "other"));
+ apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
+ }
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
if (cpumask_empty(to_cpumask(backtrace_mask)))
break;
mdelay(1);
+ touch_softlockup_watchdog();
}
clear_bit(0, &backtrace_flag);
smp_mb__after_atomic();
+ put_cpu();
}
static int
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index f3a1f04ed4c..58487445141 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -841,7 +841,6 @@ static int apm_do_idle(void)
u32 eax;
u8 ret = 0;
int idled = 0;
- int polling;
int err = 0;
if (!need_resched()) {
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ce8b8ff0e0e..60e5497681f 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -8,6 +8,7 @@
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
+#include <asm/smp.h>
#include <asm/pci-direct.h>
#ifdef CONFIG_X86_64
@@ -50,7 +51,6 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
return wrmsr_safe_regs(gprs);
}
-#ifdef CONFIG_X86_32
/*
* B step AMD K6 before B 9730xxxx have hardware bugs that can cause
* misexecution of code under Linux. Owners of such processors should
@@ -70,6 +70,7 @@ __asm__(".globl vide\n\t.align 4\nvide: ret");
static void init_amd_k5(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
/*
* General Systems BIOSen alias the cpu frequency registers
* of the Elan at 0x000df000. Unfortuantly, one of the Linux
@@ -83,11 +84,12 @@ static void init_amd_k5(struct cpuinfo_x86 *c)
if (inl(CBAR) & CBAR_ENB)
outl(0 | CBAR_KEY, CBAR);
}
+#endif
}
-
static void init_amd_k6(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
u32 l, h;
int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
@@ -176,10 +178,44 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
/* placeholder for any needed mods */
return;
}
+#endif
}
-static void amd_k7_smp_check(struct cpuinfo_x86 *c)
+static void init_amd_k7(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
+ u32 l, h;
+
+ /*
+ * Bit 15 of Athlon specific MSR 15, needs to be 0
+ * to enable SSE on Palomino/Morgan/Barton CPU's.
+ * If the BIOS didn't enable it already, enable it here.
+ */
+ if (c->x86_model >= 6 && c->x86_model <= 10) {
+ if (!cpu_has(c, X86_FEATURE_XMM)) {
+ printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
+ msr_clear_bit(MSR_K7_HWCR, 15);
+ set_cpu_cap(c, X86_FEATURE_XMM);
+ }
+ }
+
+ /*
+ * It's been determined by AMD that Athlons since model 8 stepping 1
+ * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
+ * As per AMD technical note 27212 0.2
+ */
+ if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
+ rdmsr(MSR_K7_CLK_CTL, l, h);
+ if ((l & 0xfff00000) != 0x20000000) {
+ printk(KERN_INFO
+ "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+ l, ((l & 0x000fffff)|0x20000000));
+ wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
+ }
+ }
+
+ set_cpu_cap(c, X86_FEATURE_K7);
+
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
@@ -207,7 +243,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)
if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
((c->x86_model == 7) && (c->x86_mask >= 1)) ||
(c->x86_model > 7))
- if (cpu_has_mp)
+ if (cpu_has(c, X86_FEATURE_MP))
return;
/* If we get here, not a certified SMP capable AMD system. */
@@ -219,45 +255,8 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)
WARN_ONCE(1, "WARNING: This combination of AMD"
" processors is not suitable for SMP.\n");
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
-}
-
-static void init_amd_k7(struct cpuinfo_x86 *c)
-{
- u32 l, h;
-
- /*
- * Bit 15 of Athlon specific MSR 15, needs to be 0
- * to enable SSE on Palomino/Morgan/Barton CPU's.
- * If the BIOS didn't enable it already, enable it here.
- */
- if (c->x86_model >= 6 && c->x86_model <= 10) {
- if (!cpu_has(c, X86_FEATURE_XMM)) {
- printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
- msr_clear_bit(MSR_K7_HWCR, 15);
- set_cpu_cap(c, X86_FEATURE_XMM);
- }
- }
-
- /*
- * It's been determined by AMD that Athlons since model 8 stepping 1
- * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
- * As per AMD technical note 27212 0.2
- */
- if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
- rdmsr(MSR_K7_CLK_CTL, l, h);
- if ((l & 0xfff00000) != 0x20000000) {
- printk(KERN_INFO
- "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
- l, ((l & 0x000fffff)|0x20000000));
- wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
- }
- }
-
- set_cpu_cap(c, X86_FEATURE_K7);
-
- amd_k7_smp_check(c);
-}
#endif
+}
#ifdef CONFIG_NUMA
/*
@@ -446,6 +445,26 @@ static void early_init_amd_mc(struct cpuinfo_x86 *c)
static void bsp_init_amd(struct cpuinfo_x86 *c)
{
+
+#ifdef CONFIG_X86_64
+ if (c->x86 >= 0xf) {
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+ * Don't do it for gbpages because there seems very little
+ * benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ unsigned long pfn = tseg >> PAGE_SHIFT;
+
+ printk(KERN_DEBUG "tseg: %010llx\n", tseg);
+ if (pfn_range_is_mapped(pfn, pfn + 1))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
+ }
+#endif
+
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
if (c->x86 > 0x10 ||
@@ -515,101 +534,74 @@ static const int amd_erratum_383[];
static const int amd_erratum_400[];
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
-static void init_amd(struct cpuinfo_x86 *c)
+static void init_amd_k8(struct cpuinfo_x86 *c)
{
- u32 dummy;
- unsigned long long value;
+ u32 level;
+ u64 value;
-#ifdef CONFIG_SMP
- /*
- * Disable TLB flush filter by setting HWCR.FFDIS on K8
- * bit 6 of msr C001_0015
- *
- * Errata 63 for SH-B3 steppings
- * Errata 122 for all steppings (F+ have it disabled by default)
- */
- if (c->x86 == 0xf)
- msr_set_bit(MSR_K7_HWCR, 6);
-#endif
-
- early_init_amd(c);
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+ if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/*
- * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ * Some BIOSes incorrectly force this feature, but only K8 revision D
+ * (model = 0x14) and later actually support it.
+ * (AMD Erratum #110, docId: 25759).
*/
- clear_cpu_cap(c, 0*32+31);
-
-#ifdef CONFIG_X86_64
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- if (c->x86 == 0xf) {
- u32 level;
-
- level = cpuid_eax(1);
- if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- /*
- * Some BIOSes incorrectly force this feature, but only K8
- * revision D (model = 0x14) and later actually support it.
- * (AMD Erratum #110, docId: 25759).
- */
- if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
- clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
- if (!rdmsrl_amd_safe(0xc001100d, &value)) {
- value &= ~(1ULL << 32);
- wrmsrl_amd_safe(0xc001100d, value);
- }
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
+ clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
+ if (!rdmsrl_amd_safe(0xc001100d, &value)) {
+ value &= ~BIT_64(32);
+ wrmsrl_amd_safe(0xc001100d, value);
}
-
}
- if (c->x86 >= 0x10)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- /* get apicid instead of initial apic id from cpuid */
- c->apicid = hard_smp_processor_id();
-#else
+ if (!c->x86_model_id[0])
+ strcpy(c->x86_model_id, "Hammer");
+}
+
+static void init_amd_gh(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+ /* do this for boot cpu */
+ if (c == &boot_cpu_data)
+ check_enable_amd_mmconf_dmi();
+
+ fam10h_check_enable_mmcfg();
+#endif
/*
- * FIXME: We should handle the K5 here. Set up the write
- * range and also turn on MSR 83 bits 4 and 31 (write alloc,
- * no bus pipeline)
+ * Disable GART TLB Walk Errors on Fam10h. We do this here because this
+ * is always needed when GART is enabled, even in a kernel which has no
+ * MCE support built in. BIOS should disable GartTlbWlk Errors already.
+ * If it doesn't, we do it here as suggested by the BKDG.
+ *
+ * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
+ msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
- switch (c->x86) {
- case 4:
- init_amd_k5(c);
- break;
- case 5:
- init_amd_k6(c);
- break;
- case 6: /* An Athlon/Duron */
- init_amd_k7(c);
- break;
- }
+ /*
+ * On family 10h BIOS may not have properly enabled WC+ support, causing
+ * it to be converted to CD memtype. This may result in performance
+ * degradation for certain nested-paging guests. Prevent this conversion
+ * by clearing bit 24 in MSR_AMD64_BU_CFG2.
+ *
+ * NOTE: we want to use the _safe accessors so as not to #GP kvm
+ * guests on older kvm hosts.
+ */
+ msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
- /* K6s reports MCEs but don't actually have all the MSRs */
- if (c->x86 < 6)
- clear_cpu_cap(c, X86_FEATURE_MCE);
-#endif
+ if (cpu_has_amd_erratum(c, amd_erratum_383))
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+}
- /* Enable workaround for FXSAVE leak */
- if (c->x86 >= 6)
- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
- if (!c->x86_model_id[0]) {
- switch (c->x86) {
- case 0xf:
- /* Should distinguish Models here, but this is only
- a fallback anyways. */
- strcpy(c->x86_model_id, "Hammer");
- break;
- }
- }
+static void init_amd_bd(struct cpuinfo_x86 *c)
+{
+ u64 value;
/* re-enable TopologyExtensions if switched off by BIOS */
- if ((c->x86 == 0x15) &&
- (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
+ if ((c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
!cpu_has(c, X86_FEATURE_TOPOEXT)) {
if (msr_set_bit(0xc0011005, 54) > 0) {
@@ -625,14 +617,60 @@ static void init_amd(struct cpuinfo_x86 *c)
* The way access filter has a performance penalty on some workloads.
* Disable it on the affected CPUs.
*/
- if ((c->x86 == 0x15) &&
- (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
-
+ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) {
value |= 0x1E;
wrmsrl_safe(0xc0011021, value);
}
}
+}
+
+static void init_amd(struct cpuinfo_x86 *c)
+{
+ u32 dummy;
+
+#ifdef CONFIG_SMP
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+ if (c->x86 == 0xf)
+ msr_set_bit(MSR_K7_HWCR, 6);
+#endif
+
+ early_init_amd(c);
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ if (c->x86 >= 0x10)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* get apicid instead of initial apic id from cpuid */
+ c->apicid = hard_smp_processor_id();
+
+ /* K6s reports MCEs but don't actually have all the MSRs */
+ if (c->x86 < 6)
+ clear_cpu_cap(c, X86_FEATURE_MCE);
+
+ switch (c->x86) {
+ case 4: init_amd_k5(c); break;
+ case 5: init_amd_k6(c); break;
+ case 6: init_amd_k7(c); break;
+ case 0xf: init_amd_k8(c); break;
+ case 0x10: init_amd_gh(c); break;
+ case 0x15: init_amd_bd(c); break;
+ }
+
+ /* Enable workaround for FXSAVE leak */
+ if (c->x86 >= 6)
+ set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
cpu_detect_cache_sizes(c);
@@ -656,33 +694,6 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
}
-#ifdef CONFIG_X86_64
- if (c->x86 == 0x10) {
- /* do this for boot cpu */
- if (c == &boot_cpu_data)
- check_enable_amd_mmconf_dmi();
-
- fam10h_check_enable_mmcfg();
- }
-
- if (c == &boot_cpu_data && c->x86 >= 0xf) {
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
- unsigned long pfn = tseg >> PAGE_SHIFT;
-
- printk(KERN_DEBUG "tseg: %010llx\n", tseg);
- if (pfn_range_is_mapped(pfn, pfn + 1))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
- }
-#endif
-
/*
* Family 0x12 and above processors have APIC timer
* running in deep C states.
@@ -690,34 +701,6 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
- if (c->x86 == 0x10) {
- /*
- * Disable GART TLB Walk Errors on Fam10h. We do this here
- * because this is always needed when GART is enabled, even in a
- * kernel which has no MCE support built in.
- * BIOS should disable GartTlbWlk Errors already. If
- * it doesn't, do it here as suggested by the BKDG.
- *
- * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
- */
- msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
-
- /*
- * On family 10h BIOS may not have properly enabled WC+ support,
- * causing it to be converted to CD memtype. This may result in
- * performance degradation for certain nested-paging guests.
- * Prevent this conversion by clearing bit 24 in
- * MSR_AMD64_BU_CFG2.
- *
- * NOTE: we want to use the _safe accessors so as not to #GP kvm
- * guests on older kvm hosts.
- */
- msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
-
- if (cpu_has_amd_erratum(c, amd_erratum_383))
- set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
- }
-
if (cpu_has_amd_erratum(c, amd_erratum_400))
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
@@ -741,11 +724,6 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
}
#endif
-static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
-{
- tlb_flushall_shift = 6;
-}
-
static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
{
u32 ebx, eax, ecx, edx;
@@ -793,8 +771,6 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
tlb_lli_2m[ENTRIES] = eax & mask;
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
-
- cpu_set_tlb_flushall_shift(c);
}
static const struct cpu_dev amd_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ef1b93f18ed..e4ab2b42bd6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -148,6 +148,7 @@ static int __init x86_xsave_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
return 1;
@@ -161,6 +162,13 @@ static int __init x86_xsaveopt_setup(char *s)
}
__setup("noxsaveopt", x86_xsaveopt_setup);
+static int __init x86_xsaves_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+ return 1;
+}
+__setup("noxsaves", x86_xsaves_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override = -1;
static int disable_x86_serial_nr = 1;
@@ -481,26 +489,17 @@ u16 __read_mostly tlb_lld_2m[NR_INFO];
u16 __read_mostly tlb_lld_4m[NR_INFO];
u16 __read_mostly tlb_lld_1g[NR_INFO];
-/*
- * tlb_flushall_shift shows the balance point in replacing cr3 write
- * with multiple 'invlpg'. It will do this replacement when
- * flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
- * If tlb_flushall_shift is -1, means the replacement will be disabled.
- */
-s8 __read_mostly tlb_flushall_shift = -1;
-
void cpu_detect_tlb(struct cpuinfo_x86 *c)
{
if (this_cpu->c_detect_tlb)
this_cpu->c_detect_tlb(c);
printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
- "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
- "tlb_flushall_shift: %d\n",
+ "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
- tlb_lld_1g[ENTRIES], tlb_flushall_shift);
+ tlb_lld_1g[ENTRIES]);
}
void detect_ht(struct cpuinfo_x86 *c)
@@ -634,6 +633,15 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[9] = ebx;
}
+ /* Extended state features: level 0x0000000d */
+ if (c->cpuid_level >= 0x0000000d) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[10] = eax;
+ }
+
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index a80029035bf..74e804ddc5c 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -253,7 +253,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
*/
if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
(c->x86_mask < 0x6 || c->x86_mask == 0xb))
- set_cpu_cap(c, X86_FEATURE_11AP);
+ set_cpu_bug(c, X86_BUG_11AP);
#ifdef CONFIG_X86_INTEL_USERCOPY
@@ -370,6 +370,17 @@ static void init_intel(struct cpuinfo_x86 *c)
*/
detect_extended_topology(c);
+ if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
+ /*
+ * let's use the legacy cpuid vector 0x1 and 0x4 for topology
+ * detection.
+ */
+ c->x86_max_cores = intel_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
+#endif
+ }
+
l2 = init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
@@ -391,7 +402,7 @@ static void init_intel(struct cpuinfo_x86 *c)
if (c->x86 == 6 && cpu_has_clflush &&
(c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
- set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
+ set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
#ifdef CONFIG_X86_64
if (c->x86 == 15)
@@ -438,17 +449,6 @@ static void init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_P3);
#endif
- if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
- /*
- * let's use the legacy cpuid vector 0x1 and 0x4 for topology
- * detection.
- */
- c->x86_max_cores = intel_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
- }
-
/* Work around errata */
srat_detect_node(c);
@@ -634,31 +634,6 @@ static void intel_tlb_lookup(const unsigned char desc)
}
}
-static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
-{
- switch ((c->x86 << 8) + c->x86_model) {
- case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
- case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
- case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
- case 0x61d: /* six-core 45 nm xeon "Dunnington" */
- tlb_flushall_shift = -1;
- break;
- case 0x63a: /* Ivybridge */
- tlb_flushall_shift = 2;
- break;
- case 0x61a: /* 45 nm nehalem, "Bloomfield" */
- case 0x61e: /* 45 nm nehalem, "Lynnfield" */
- case 0x625: /* 32 nm nehalem, "Clarkdale" */
- case 0x62c: /* 32 nm nehalem, "Gulftown" */
- case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
- case 0x62f: /* 32 nm Xeon E7 */
- case 0x62a: /* SandyBridge */
- case 0x62d: /* SandyBridge, "Romely-EP" */
- default:
- tlb_flushall_shift = 6;
- }
-}
-
static void intel_detect_tlb(struct cpuinfo_x86 *c)
{
int i, j, n;
@@ -683,7 +658,6 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
for (j = 1 ; j < 16 ; j++)
intel_tlb_lookup(desc[j]);
}
- intel_tlb_flushall_shift_set(c);
}
static const struct cpu_dev intel_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a952e9c85b6..c7035073dfc 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -461,7 +461,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- if (strict_strtoul(buf, 10, &val) < 0)
+ if (kstrtoul(buf, 10, &val) < 0)
return -EINVAL;
err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
@@ -511,7 +511,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
return -EINVAL;
- if (strict_strtoul(buf, 16, &val) < 0)
+ if (kstrtoul(buf, 16, &val) < 0)
return -EINVAL;
if (amd_set_subcaches(cpu, val))
@@ -730,6 +730,18 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
#endif
}
+#ifdef CONFIG_X86_HT
+ /*
+ * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
+ * turns means that the only possibility is SMT (as indicated in
+ * cpuid1). Since cpuid2 doesn't specify shared caches, and we know
+ * that SMT shares all caches, we can unconditionally set cpu_llc_id to
+ * c->phys_proc_id.
+ */
+ if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
+ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+#endif
+
c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
return l2;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index bb92f38153b..bd9ccda8087 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2136,7 +2136,7 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
attr_to_bank(attr)->ctl = new;
@@ -2174,7 +2174,7 @@ static ssize_t set_ignore_ce(struct device *s,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
if (mca_cfg.ignore_ce ^ !!new) {
@@ -2198,7 +2198,7 @@ static ssize_t set_cmci_disabled(struct device *s,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
if (mca_cfg.cmci_disabled ^ !!new) {
@@ -2385,6 +2385,10 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
threshold_cpu_callback(action, cpu);
mce_device_remove(cpu);
mce_intel_hcpu_update(cpu);
+
+ /* intentionally ignoring frozen here */
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_rediscover();
break;
case CPU_DOWN_PREPARE:
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
@@ -2396,11 +2400,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
}
- if (action == CPU_POST_DEAD) {
- /* intentionally ignoring frozen here */
- cmci_rediscover();
- }
-
return NOTIFY_OK;
}
@@ -2451,6 +2450,12 @@ static __init int mcheck_init_device(void)
for_each_online_cpu(i) {
err = mce_device_create(i);
if (err) {
+ /*
+ * Register notifier anyway (and do not unreg it) so
+ * that we don't leave undeleted timers, see notifier
+ * callback above.
+ */
+ __register_hotcpu_notifier(&mce_cpu_notifier);
cpu_notifier_register_done();
goto err_device_create;
}
@@ -2471,10 +2476,6 @@ static __init int mcheck_init_device(void)
err_register:
unregister_syscore_ops(&mce_syscore_ops);
- cpu_notifier_register_begin();
- __unregister_hotcpu_notifier(&mce_cpu_notifier);
- cpu_notifier_register_done();
-
err_device_create:
/*
* We didn't keep track of which devices were created above, but
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 603df4f7464..1e49f8f4127 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -353,7 +353,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
if (!b->interrupt_capable)
return -EINVAL;
- if (strict_strtoul(buf, 0, &new) < 0)
+ if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
b->interrupt_enable = !!new;
@@ -372,7 +372,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
struct thresh_restart tr;
unsigned long new;
- if (strict_strtoul(buf, 0, &new) < 0)
+ if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
if (new > THRESHOLD_MAX)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 9a316b21df8..3bdb95ae8c4 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -42,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
-static DEFINE_SPINLOCK(cmci_discover_lock);
+static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
@@ -144,14 +144,14 @@ static void cmci_storm_disable_banks(void)
int bank;
u64 val;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
owned = __get_cpu_var(mce_banks_owned);
for_each_set_bit(bank, owned, MAX_NR_BANKS) {
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
}
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static bool cmci_storm_detect(void)
@@ -211,7 +211,7 @@ static void cmci_discover(int banks)
int i;
int bios_wrong_thresh = 0;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;
@@ -266,7 +266,7 @@ static void cmci_discover(int banks)
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
}
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
pr_info_once(
"bios_cmci_threshold: Some banks do not have valid thresholds set\n");
@@ -316,10 +316,10 @@ void cmci_clear(void)
if (!cmci_supported(&banks))
return;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++)
__cmci_disable_bank(i);
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void cmci_rediscover_work_func(void *arg)
@@ -360,9 +360,9 @@ void cmci_disable_bank(int bank)
if (!cmci_supported(&banks))
return;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
__cmci_disable_bank(bank);
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void intel_init_cmci(void)
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 2bf61650549..e2b22df964c 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -1,23 +1,25 @@
#!/bin/sh
#
-# Generate the x86_cap_flags[] array from include/asm/cpufeature.h
+# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h
#
IN=$1
OUT=$2
-TABS="$(printf '\t\t\t\t\t')"
-trap 'rm "$OUT"' EXIT
+function dump_array()
+{
+ ARRAY=$1
+ SIZE=$2
+ PFX=$3
+ POSTFIX=$4
-(
- echo "#ifndef _ASM_X86_CPUFEATURE_H"
- echo "#include <asm/cpufeature.h>"
- echo "#endif"
- echo ""
- echo "const char * const x86_cap_flags[NCAPINTS*32] = {"
+ PFX_SZ=$(echo $PFX | wc -c)
+ TABS="$(printf '\t\t\t\t\t')"
+
+ echo "const char * const $ARRAY[$SIZE] = {"
- # Iterate through any input lines starting with #define X86_FEATURE_
- sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN |
+ # Iterate through any input lines starting with #define $PFX
+ sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN |
while read i
do
# Name is everything up to the first whitespace
@@ -31,11 +33,32 @@ trap 'rm "$OUT"' EXIT
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
- TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 ))
- printf "\t[%s]%.*s = %s,\n" \
- "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ if [ -n "$POSTFIX" ]; then
+ T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 ))
+ TABS="$(printf '\t\t\t\t\t\t')"
+ TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE"
+ else
+ TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ fi
done
echo "};"
+}
+
+trap 'rm "$OUT"' EXIT
+
+(
+ echo "#ifndef _ASM_X86_CPUFEATURE_H"
+ echo "#include <asm/cpufeature.h>"
+ echo "#endif"
+ echo ""
+
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
+ echo ""
+
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
+
) > $OUT
trap - EXIT
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2bdfbff8a4f..2879ecdaac4 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -118,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
continue;
if (event->attr.config1 & ~er->valid_mask)
return -EINVAL;
+ /* Check if the extra msrs can be safely accessed*/
+ if (!er->extra_msr_access)
+ return -ENXIO;
reg->idx = er->idx;
reg->config = event->attr.config1;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 3b2f9bdd974..8ade93111e0 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -295,14 +295,16 @@ struct extra_reg {
u64 config_mask;
u64 valid_mask;
int idx; /* per_xxx->regs[] reg index */
+ bool extra_msr_access;
};
#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
- .event = (e), \
- .msr = (ms), \
- .config_mask = (m), \
- .valid_mask = (vm), \
- .idx = EXTRA_REG_##i, \
+ .event = (e), \
+ .msr = (ms), \
+ .config_mask = (m), \
+ .valid_mask = (vm), \
+ .idx = EXTRA_REG_##i, \
+ .extra_msr_access = true, \
}
#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
index 3bbdf4cd38b..30790d798e6 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
@@ -294,31 +294,41 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
cpu_to_node(cpu));
}
-static void amd_uncore_cpu_up_prepare(unsigned int cpu)
+static int amd_uncore_cpu_up_prepare(unsigned int cpu)
{
- struct amd_uncore *uncore;
+ struct amd_uncore *uncore_nb = NULL, *uncore_l2;
if (amd_uncore_nb) {
- uncore = amd_uncore_alloc(cpu);
- uncore->cpu = cpu;
- uncore->num_counters = NUM_COUNTERS_NB;
- uncore->rdpmc_base = RDPMC_BASE_NB;
- uncore->msr_base = MSR_F15H_NB_PERF_CTL;
- uncore->active_mask = &amd_nb_active_mask;
- uncore->pmu = &amd_nb_pmu;
- *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+ uncore_nb = amd_uncore_alloc(cpu);
+ if (!uncore_nb)
+ goto fail;
+ uncore_nb->cpu = cpu;
+ uncore_nb->num_counters = NUM_COUNTERS_NB;
+ uncore_nb->rdpmc_base = RDPMC_BASE_NB;
+ uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
+ uncore_nb->active_mask = &amd_nb_active_mask;
+ uncore_nb->pmu = &amd_nb_pmu;
+ *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
}
if (amd_uncore_l2) {
- uncore = amd_uncore_alloc(cpu);
- uncore->cpu = cpu;
- uncore->num_counters = NUM_COUNTERS_L2;
- uncore->rdpmc_base = RDPMC_BASE_L2;
- uncore->msr_base = MSR_F16H_L2I_PERF_CTL;
- uncore->active_mask = &amd_l2_active_mask;
- uncore->pmu = &amd_l2_pmu;
- *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+ uncore_l2 = amd_uncore_alloc(cpu);
+ if (!uncore_l2)
+ goto fail;
+ uncore_l2->cpu = cpu;
+ uncore_l2->num_counters = NUM_COUNTERS_L2;
+ uncore_l2->rdpmc_base = RDPMC_BASE_L2;
+ uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
+ uncore_l2->active_mask = &amd_l2_active_mask;
+ uncore_l2->pmu = &amd_l2_pmu;
+ *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
}
+
+ return 0;
+
+fail:
+ kfree(uncore_nb);
+ return -ENOMEM;
}
static struct amd_uncore *
@@ -441,7 +451,7 @@ static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
if (!--uncore->refcnt)
kfree(uncore);
- *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
+ *per_cpu_ptr(uncores, cpu) = NULL;
}
static void amd_uncore_cpu_dead(unsigned int cpu)
@@ -461,7 +471,8 @@ amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- amd_uncore_cpu_up_prepare(cpu);
+ if (amd_uncore_cpu_up_prepare(cpu))
+ return notifier_from_errno(-ENOMEM);
break;
case CPU_STARTING:
@@ -501,20 +512,33 @@ static void __init init_cpu_already_online(void *dummy)
amd_uncore_cpu_online(cpu);
}
+static void cleanup_cpu_online(void *dummy)
+{
+ unsigned int cpu = smp_processor_id();
+
+ amd_uncore_cpu_dead(cpu);
+}
+
static int __init amd_uncore_init(void)
{
- unsigned int cpu;
+ unsigned int cpu, cpu2;
int ret = -ENODEV;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
- return -ENODEV;
+ goto fail_nodev;
if (!cpu_has_topoext)
- return -ENODEV;
+ goto fail_nodev;
if (cpu_has_perfctr_nb) {
amd_uncore_nb = alloc_percpu(struct amd_uncore *);
- perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+ if (!amd_uncore_nb) {
+ ret = -ENOMEM;
+ goto fail_nb;
+ }
+ ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+ if (ret)
+ goto fail_nb;
printk(KERN_INFO "perf: AMD NB counters detected\n");
ret = 0;
@@ -522,20 +546,28 @@ static int __init amd_uncore_init(void)
if (cpu_has_perfctr_l2) {
amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
- perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+ if (!amd_uncore_l2) {
+ ret = -ENOMEM;
+ goto fail_l2;
+ }
+ ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+ if (ret)
+ goto fail_l2;
printk(KERN_INFO "perf: AMD L2I counters detected\n");
ret = 0;
}
if (ret)
- return -ENODEV;
+ goto fail_nodev;
cpu_notifier_register_begin();
/* init cpus already online before registering for hotplug notifier */
for_each_online_cpu(cpu) {
- amd_uncore_cpu_up_prepare(cpu);
+ ret = amd_uncore_cpu_up_prepare(cpu);
+ if (ret)
+ goto fail_online;
smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
}
@@ -543,5 +575,30 @@ static int __init amd_uncore_init(void)
cpu_notifier_register_done();
return 0;
+
+
+fail_online:
+ for_each_online_cpu(cpu2) {
+ if (cpu2 == cpu)
+ break;
+ smp_call_function_single(cpu, cleanup_cpu_online, NULL, 1);
+ }
+ cpu_notifier_register_done();
+
+ /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
+ amd_uncore_nb = amd_uncore_l2 = NULL;
+ if (cpu_has_perfctr_l2)
+ perf_pmu_unregister(&amd_l2_pmu);
+fail_l2:
+ if (cpu_has_perfctr_nb)
+ perf_pmu_unregister(&amd_nb_pmu);
+ if (amd_uncore_l2)
+ free_percpu(amd_uncore_l2);
+fail_nb:
+ if (amd_uncore_nb)
+ free_percpu(amd_uncore_nb);
+
+fail_nodev:
+ return ret;
}
device_initcall(amd_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index adb02aa62af..2502d0d9d24 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1382,6 +1382,15 @@ again:
intel_pmu_lbr_read();
/*
+ * CondChgd bit 63 doesn't mean any overflow status. Ignore
+ * and clear the bit.
+ */
+ if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+ if (!status)
+ goto done;
+ }
+
+ /*
* PEBS overflow sets bit 62 in the global status register
*/
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
@@ -2173,6 +2182,41 @@ static void intel_snb_check_microcode(void)
}
}
+/*
+ * Under certain circumstances, access certain MSR may cause #GP.
+ * The function tests if the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+ u64 val_old, val_new, val_tmp;
+
+ /*
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+ */
+ if (rdmsrl_safe(msr, &val_old))
+ return false;
+
+ /*
+ * Only change the bits which can be updated by wrmsrl.
+ */
+ val_tmp = val_old ^ mask;
+ if (wrmsrl_safe(msr, val_tmp) ||
+ rdmsrl_safe(msr, &val_new))
+ return false;
+
+ if (val_new != val_tmp)
+ return false;
+
+ /* Here it's sure that the MSR can be safely accessed.
+ * Restore the old value and return.
+ */
+ wrmsrl(msr, val_old);
+
+ return true;
+}
+
static __init void intel_sandybridge_quirk(void)
{
x86_pmu.check_microcode = intel_snb_check_microcode;
@@ -2262,7 +2306,8 @@ __init int intel_pmu_init(void)
union cpuid10_ebx ebx;
struct event_constraint *c;
unsigned int unused;
- int version;
+ struct extra_reg *er;
+ int version, i;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
@@ -2465,6 +2510,9 @@ __init int intel_pmu_init(void)
case 62: /* IvyBridge EP */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ /* dTLB-load-misses on IVB is different than SNB */
+ hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
sizeof(hw_cache_extra_regs));
@@ -2565,6 +2613,34 @@ __init int intel_pmu_init(void)
}
}
+ /*
+ * Access LBR MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support LBR MSR
+ * Check all LBT MSR here.
+ * Disable LBR access if any LBR MSRs can not be accessed.
+ */
+ if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+ x86_pmu.lbr_nr = 0;
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+ check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+ x86_pmu.lbr_nr = 0;
+ }
+
+ /*
+ * Access extra MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support offcore event
+ * Check all extra_regs here.
+ */
+ if (x86_pmu.extra_regs) {
+ for (er = x86_pmu.extra_regs; er->msr; er++) {
+ er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+ /* Disable LBR select mapping */
+ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+ x86_pmu.lbr_sel_map = NULL;
+ }
+ }
+
/* Support full width counters using alternative MSR range */
if (x86_pmu.intel_cap.full_width_write) {
x86_pmu.max_period = x86_pmu.cntval_mask;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 980970cb744..696ade311de 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -311,9 +311,11 @@ static int alloc_bts_buffer(int cpu)
if (!x86_pmu.bts)
return 0;
- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node);
- if (unlikely(!buffer))
+ buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+ if (unlikely(!buffer)) {
+ WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
+ }
max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
thresh = max / 16;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 65bbbea38b9..cfc6f9dfcd9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -550,16 +550,16 @@ static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
@@ -1222,6 +1222,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+
SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
@@ -1245,7 +1246,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
@@ -2946,10 +2947,7 @@ again:
* extra registers. If we failed to take an extra
* register, try the alternative.
*/
- if (idx % 2)
- idx--;
- else
- idx++;
+ idx ^= 1;
if (idx != reg1->idx % 6) {
if (idx == 2)
config1 >>= 8;
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 06fe3ed8b85..5433658e598 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -97,6 +97,14 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
+ seq_printf(m, "\nbugs\t\t:");
+ for (i = 0; i < 32*NBUGINTS; i++) {
+ unsigned int bug_bit = 32*NCAPINTS + i;
+
+ if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i])
+ seq_printf(m, " %s", x86_bug_flags[i]);
+ }
+
seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
c->loops_per_jiffy/(500000/HZ),
(c->loops_per_jiffy/(5000/HZ)) % 100);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index b6f794aa169..4a8013d5594 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -38,7 +38,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
- { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
{ X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 },
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 507de806659..0553a34fa0d 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -4,9 +4,14 @@
* Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
*
* Copyright (C) IBM Corporation, 2004. All rights reserved.
+ * Copyright (C) Red Hat Inc., 2014. All rights reserved.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
*
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
@@ -16,6 +21,7 @@
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -28,6 +34,45 @@
#include <asm/reboot.h>
#include <asm/virtext.h>
+/* Alignment required for elf header segment */
+#define ELF_CORE_HEADER_ALIGN 4096
+
+/* This primarily represents number of split ranges due to exclusion */
+#define CRASH_MAX_RANGES 16
+
+struct crash_mem_range {
+ u64 start, end;
+};
+
+struct crash_mem {
+ unsigned int nr_ranges;
+ struct crash_mem_range ranges[CRASH_MAX_RANGES];
+};
+
+/* Misc data about ram ranges needed to prepare elf headers */
+struct crash_elf_data {
+ struct kimage *image;
+ /*
+ * Total number of ram ranges we have after various adjustments for
+ * GART, crash reserved region etc.
+ */
+ unsigned int max_nr_ranges;
+ unsigned long gart_start, gart_end;
+
+ /* Pointer to elf header */
+ void *ehdr;
+ /* Pointer to next phdr */
+ void *bufp;
+ struct crash_mem mem;
+};
+
+/* Used while preparing memory map entries for second kernel */
+struct crash_memmap_data {
+ struct boot_params *params;
+ /* Type of memory */
+ unsigned int type;
+};
+
int in_crash_kexec;
/*
@@ -39,6 +84,7 @@ int in_crash_kexec;
*/
crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+unsigned long crash_zero_bytes;
static inline void cpu_crash_vmclear_loaded_vmcss(void)
{
@@ -135,3 +181,520 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#endif
crash_save_cpu(regs, safe_smp_processor_id());
}
+
+#ifdef CONFIG_X86_64
+
+static int get_nr_ram_ranges_callback(unsigned long start_pfn,
+ unsigned long nr_pfn, void *arg)
+{
+ int *nr_ranges = arg;
+
+ (*nr_ranges)++;
+ return 0;
+}
+
+static int get_gart_ranges_callback(u64 start, u64 end, void *arg)
+{
+ struct crash_elf_data *ced = arg;
+
+ ced->gart_start = start;
+ ced->gart_end = end;
+
+ /* Not expecting more than 1 gart aperture */
+ return 1;
+}
+
+
+/* Gather all the required information to prepare elf headers for ram regions */
+static void fill_up_crash_elf_data(struct crash_elf_data *ced,
+ struct kimage *image)
+{
+ unsigned int nr_ranges = 0;
+
+ ced->image = image;
+
+ walk_system_ram_range(0, -1, &nr_ranges,
+ get_nr_ram_ranges_callback);
+
+ ced->max_nr_ranges = nr_ranges;
+
+ /*
+ * We don't create ELF headers for GART aperture as an attempt
+ * to dump this memory in second kernel leads to hang/crash.
+ * If gart aperture is present, one needs to exclude that region
+ * and that could lead to need of extra phdr.
+ */
+ walk_iomem_res("GART", IORESOURCE_MEM, 0, -1,
+ ced, get_gart_ranges_callback);
+
+ /*
+ * If we have gart region, excluding that could potentially split
+ * a memory range, resulting in extra header. Account for that.
+ */
+ if (ced->gart_end)
+ ced->max_nr_ranges++;
+
+ /* Exclusion of crash region could split memory ranges */
+ ced->max_nr_ranges++;
+
+ /* If crashk_low_res is not 0, another range split possible */
+ if (crashk_low_res.end != 0)
+ ced->max_nr_ranges++;
+}
+
+static int exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart, unsigned long long mend)
+{
+ int i, j;
+ unsigned long long start, end;
+ struct crash_mem_range temp_range = {0, 0};
+
+ for (i = 0; i < mem->nr_ranges; i++) {
+ start = mem->ranges[i].start;
+ end = mem->ranges[i].end;
+
+ if (mstart > end || mend < start)
+ continue;
+
+ /* Truncate any area outside of range */
+ if (mstart < start)
+ mstart = start;
+ if (mend > end)
+ mend = end;
+
+ /* Found completely overlapping range */
+ if (mstart == start && mend == end) {
+ mem->ranges[i].start = 0;
+ mem->ranges[i].end = 0;
+ if (i < mem->nr_ranges - 1) {
+ /* Shift rest of the ranges to left */
+ for (j = i; j < mem->nr_ranges - 1; j++) {
+ mem->ranges[j].start =
+ mem->ranges[j+1].start;
+ mem->ranges[j].end =
+ mem->ranges[j+1].end;
+ }
+ }
+ mem->nr_ranges--;
+ return 0;
+ }
+
+ if (mstart > start && mend < end) {
+ /* Split original range */
+ mem->ranges[i].end = mstart - 1;
+ temp_range.start = mend + 1;
+ temp_range.end = end;
+ } else if (mstart != start)
+ mem->ranges[i].end = mstart - 1;
+ else
+ mem->ranges[i].start = mend + 1;
+ break;
+ }
+
+ /* If a split happend, add the split to array */
+ if (!temp_range.end)
+ return 0;
+
+ /* Split happened */
+ if (i == CRASH_MAX_RANGES - 1) {
+ pr_err("Too many crash ranges after split\n");
+ return -ENOMEM;
+ }
+
+ /* Location where new range should go */
+ j = i + 1;
+ if (j < mem->nr_ranges) {
+ /* Move over all ranges one slot towards the end */
+ for (i = mem->nr_ranges - 1; i >= j; i--)
+ mem->ranges[i + 1] = mem->ranges[i];
+ }
+
+ mem->ranges[j].start = temp_range.start;
+ mem->ranges[j].end = temp_range.end;
+ mem->nr_ranges++;
+ return 0;
+}
+
+/*
+ * Look for any unwanted ranges between mstart, mend and remove them. This
+ * might lead to split and split ranges are put in ced->mem.ranges[] array
+ */
+static int elf_header_exclude_ranges(struct crash_elf_data *ced,
+ unsigned long long mstart, unsigned long long mend)
+{
+ struct crash_mem *cmem = &ced->mem;
+ int ret = 0;
+
+ memset(cmem->ranges, 0, sizeof(cmem->ranges));
+
+ cmem->ranges[0].start = mstart;
+ cmem->ranges[0].end = mend;
+ cmem->nr_ranges = 1;
+
+ /* Exclude crashkernel region */
+ ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+ if (ret)
+ return ret;
+
+ ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end);
+ if (ret)
+ return ret;
+
+ /* Exclude GART region */
+ if (ced->gart_end) {
+ ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+
+static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
+{
+ struct crash_elf_data *ced = arg;
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long mstart, mend;
+ struct kimage *image = ced->image;
+ struct crash_mem *cmem;
+ int ret, i;
+
+ ehdr = ced->ehdr;
+
+ /* Exclude unwanted mem ranges */
+ ret = elf_header_exclude_ranges(ced, start, end);
+ if (ret)
+ return ret;
+
+ /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */
+ cmem = &ced->mem;
+
+ for (i = 0; i < cmem->nr_ranges; i++) {
+ mstart = cmem->ranges[i].start;
+ mend = cmem->ranges[i].end;
+
+ phdr = ced->bufp;
+ ced->bufp += sizeof(Elf64_Phdr);
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mstart;
+
+ /*
+ * If a range matches backup region, adjust offset to backup
+ * segment.
+ */
+ if (mstart == image->arch.backup_src_start &&
+ (mend - mstart + 1) == image->arch.backup_src_sz)
+ phdr->p_offset = image->arch.backup_load_addr;
+
+ phdr->p_paddr = mstart;
+ phdr->p_vaddr = (unsigned long long) __va(mstart);
+ phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+ phdr->p_align = 0;
+ ehdr->e_phnum++;
+ pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+ }
+
+ return ret;
+}
+
+static int prepare_elf64_headers(struct crash_elf_data *ced,
+ void **addr, unsigned long *sz)
+{
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+ unsigned char *buf, *bufp;
+ unsigned int cpu;
+ unsigned long long notes_addr;
+ int ret;
+
+ /* extra phdr for vmcoreinfo elf note */
+ nr_phdr = nr_cpus + 1;
+ nr_phdr += ced->max_nr_ranges;
+
+ /*
+ * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+ * area on x86_64 (ffffffff80000000 - ffffffffa0000000).
+ * I think this is required by tools like gdb. So same physical
+ * memory will be mapped in two elf headers. One will contain kernel
+ * text virtual addresses and other will have __va(physical) addresses.
+ */
+
+ nr_phdr++;
+ elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+ elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+ buf = vzalloc(elf_sz);
+ if (!buf)
+ return -ENOMEM;
+
+ bufp = buf;
+ ehdr = (Elf64_Ehdr *)bufp;
+ bufp += sizeof(Elf64_Ehdr);
+ memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+ ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+ ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_phoff = sizeof(Elf64_Ehdr);
+ ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+ /* Prepare one phdr of type PT_NOTE for each present cpu */
+ for_each_present_cpu(cpu) {
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = PT_NOTE;
+ notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+ phdr->p_offset = phdr->p_paddr = notes_addr;
+ phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+ (ehdr->e_phnum)++;
+ }
+
+ /* Prepare one PT_NOTE header for vmcoreinfo */
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+ phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
+ (ehdr->e_phnum)++;
+
+#ifdef CONFIG_X86_64
+ /* Prepare PT_LOAD type program header for kernel text region */
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (Elf64_Addr)_text;
+ phdr->p_filesz = phdr->p_memsz = _end - _text;
+ phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+ (ehdr->e_phnum)++;
+#endif
+
+ /* Prepare PT_LOAD headers for system ram chunks. */
+ ced->ehdr = ehdr;
+ ced->bufp = bufp;
+ ret = walk_system_ram_res(0, -1, ced,
+ prepare_elf64_ram_headers_callback);
+ if (ret < 0)
+ return ret;
+
+ *addr = buf;
+ *sz = elf_sz;
+ return 0;
+}
+
+/* Prepare elf headers. Return addr and size */
+static int prepare_elf_headers(struct kimage *image, void **addr,
+ unsigned long *sz)
+{
+ struct crash_elf_data *ced;
+ int ret;
+
+ ced = kzalloc(sizeof(*ced), GFP_KERNEL);
+ if (!ced)
+ return -ENOMEM;
+
+ fill_up_crash_elf_data(ced, image);
+
+ /* By default prepare 64bit headers */
+ ret = prepare_elf64_headers(ced, addr, sz);
+ kfree(ced);
+ return ret;
+}
+
+static int add_e820_entry(struct boot_params *params, struct e820entry *entry)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = params->e820_entries;
+ if (nr_e820_entries >= E820MAX)
+ return 1;
+
+ memcpy(&params->e820_map[nr_e820_entries], entry,
+ sizeof(struct e820entry));
+ params->e820_entries++;
+ return 0;
+}
+
+static int memmap_entry_callback(u64 start, u64 end, void *arg)
+{
+ struct crash_memmap_data *cmd = arg;
+ struct boot_params *params = cmd->params;
+ struct e820entry ei;
+
+ ei.addr = start;
+ ei.size = end - start + 1;
+ ei.type = cmd->type;
+ add_e820_entry(params, &ei);
+
+ return 0;
+}
+
+static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
+ unsigned long long mstart,
+ unsigned long long mend)
+{
+ unsigned long start, end;
+ int ret = 0;
+
+ cmem->ranges[0].start = mstart;
+ cmem->ranges[0].end = mend;
+ cmem->nr_ranges = 1;
+
+ /* Exclude Backup region */
+ start = image->arch.backup_load_addr;
+ end = start + image->arch.backup_src_sz - 1;
+ ret = exclude_mem_range(cmem, start, end);
+ if (ret)
+ return ret;
+
+ /* Exclude elf header region */
+ start = image->arch.elf_load_addr;
+ end = start + image->arch.elf_headers_sz - 1;
+ return exclude_mem_range(cmem, start, end);
+}
+
+/* Prepare memory map for crash dump kernel */
+int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
+{
+ int i, ret = 0;
+ unsigned long flags;
+ struct e820entry ei;
+ struct crash_memmap_data cmd;
+ struct crash_mem *cmem;
+
+ cmem = vzalloc(sizeof(struct crash_mem));
+ if (!cmem)
+ return -ENOMEM;
+
+ memset(&cmd, 0, sizeof(struct crash_memmap_data));
+ cmd.params = params;
+
+ /* Add first 640K segment */
+ ei.addr = image->arch.backup_src_start;
+ ei.size = image->arch.backup_src_sz;
+ ei.type = E820_RAM;
+ add_e820_entry(params, &ei);
+
+ /* Add ACPI tables */
+ cmd.type = E820_ACPI;
+ flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add ACPI Non-volatile Storage */
+ cmd.type = E820_NVS;
+ walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add crashk_low_res region */
+ if (crashk_low_res.end) {
+ ei.addr = crashk_low_res.start;
+ ei.size = crashk_low_res.end - crashk_low_res.start + 1;
+ ei.type = E820_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+ /* Exclude some ranges from crashk_res and add rest to memmap */
+ ret = memmap_exclude_ranges(image, cmem, crashk_res.start,
+ crashk_res.end);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < cmem->nr_ranges; i++) {
+ ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1;
+
+ /* If entry is less than a page, skip it */
+ if (ei.size < PAGE_SIZE)
+ continue;
+ ei.addr = cmem->ranges[i].start;
+ ei.type = E820_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+out:
+ vfree(cmem);
+ return ret;
+}
+
+static int determine_backup_region(u64 start, u64 end, void *arg)
+{
+ struct kimage *image = arg;
+
+ image->arch.backup_src_start = start;
+ image->arch.backup_src_sz = end - start + 1;
+
+ /* Expecting only one range for backup region */
+ return 1;
+}
+
+int crash_load_segments(struct kimage *image)
+{
+ unsigned long src_start, src_sz, elf_sz;
+ void *elf_addr;
+ int ret;
+
+ /*
+ * Determine and load a segment for backup area. First 640K RAM
+ * region is backup source
+ */
+
+ ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
+ image, determine_backup_region);
+
+ /* Zero or postive return values are ok */
+ if (ret < 0)
+ return ret;
+
+ src_start = image->arch.backup_src_start;
+ src_sz = image->arch.backup_src_sz;
+
+ /* Add backup segment. */
+ if (src_sz) {
+ /*
+ * Ideally there is no source for backup segment. This is
+ * copied in purgatory after crash. Just add a zero filled
+ * segment for now to make sure checksum logic works fine.
+ */
+ ret = kexec_add_buffer(image, (char *)&crash_zero_bytes,
+ sizeof(crash_zero_bytes), src_sz,
+ PAGE_SIZE, 0, -1, 0,
+ &image->arch.backup_load_addr);
+ if (ret)
+ return ret;
+ pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
+ image->arch.backup_load_addr, src_start, src_sz);
+ }
+
+ /* Prepare elf headers and add a segment */
+ ret = prepare_elf_headers(image, &elf_addr, &elf_sz);
+ if (ret)
+ return ret;
+
+ image->arch.elf_headers = elf_addr;
+ image->arch.elf_headers_sz = elf_sz;
+
+ ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz,
+ ELF_CORE_HEADER_ALIGN, 0, -1, 0,
+ &image->arch.elf_load_addr);
+ if (ret) {
+ vfree((void *)image->arch.elf_headers);
+ return ret;
+ }
+ pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->arch.elf_load_addr, elf_sz, elf_sz);
+
+ return ret;
+}
+
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f0da82b8e63..47c410d99f5 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -423,8 +423,9 @@ sysenter_past_esp:
jnz sysenter_audit
sysenter_do_call:
cmpl $(NR_syscalls), %eax
- jae syscall_badsys
+ jae sysenter_badsys
call *sys_call_table(,%eax,4)
+sysenter_after_call:
movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
@@ -501,6 +502,7 @@ ENTRY(system_call)
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
+syscall_after_call:
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
@@ -674,8 +676,13 @@ syscall_fault:
END(syscall_fault)
syscall_badsys:
- movl $-ENOSYS,PT_EAX(%esp)
- jmp resume_userspace
+ movl $-ENOSYS,%eax
+ jmp syscall_after_call
+END(syscall_badsys)
+
+sysenter_badsys:
+ movl $-ENOSYS,%eax
+ jmp sysenter_after_call
END(syscall_badsys)
CFI_ENDPROC
@@ -1052,9 +1059,6 @@ ENTRY(mcount)
END(mcount)
ENTRY(ftrace_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
pushl %eax
pushl %ecx
pushl %edx
@@ -1086,8 +1090,6 @@ END(ftrace_caller)
ENTRY(ftrace_regs_caller)
pushf /* push flags before compare (in cs location) */
- cmpl $0, function_trace_stop
- jne ftrace_restore_flags
/*
* i386 does not save SS and ESP when coming from kernel.
@@ -1146,7 +1148,6 @@ GLOBAL(ftrace_regs_call)
popf /* Pop flags at end (no addl to corrupt flags) */
jmp ftrace_ret
-ftrace_restore_flags:
popf
jmp ftrace_stub
#else /* ! CONFIG_DYNAMIC_FTRACE */
@@ -1155,9 +1156,6 @@ ENTRY(mcount)
cmpl $__PAGE_OFFSET, %esp
jb ftrace_stub /* Paging not enabled yet? */
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
cmpl $ftrace_stub, ftrace_trace_function
jnz trace
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b25ca969edd..2fac1343a90 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -207,7 +207,6 @@ ENDPROC(native_usergs_sysret64)
*/
.macro XCPT_FRAME start=1 offset=0
INTR_FRAME \start, RIP+\offset-ORIG_RAX
- /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
.endm
/*
@@ -287,21 +286,21 @@ ENDPROC(native_usergs_sysret64)
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
+ movq %rdi, RDI+8(%rsp)
+ movq %rsi, RSI+8(%rsp)
movq_cfi rdx, RDX+8
movq_cfi rcx, RCX+8
movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
+ movq %r8, R8+8(%rsp)
+ movq %r9, R9+8(%rsp)
+ movq %r10, R10+8(%rsp)
+ movq %r11, R11+8(%rsp)
movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
+ movq %rbp, RBP+8(%rsp)
+ movq %r12, R12+8(%rsp)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
movl $1,%ebx
movl $MSR_GS_BASE,%ecx
rdmsr
@@ -830,27 +829,24 @@ restore_args:
RESTORE_ARGS 1,8,1
irq_return:
+ INTERRUPT_RETURN
+
+ENTRY(native_iret)
/*
* Are we returning to a stack segment from the LDT? Note: in
* 64-bit mode SS:RSP on the exception stack is always valid.
*/
#ifdef CONFIG_X86_ESPFIX64
testb $4,(SS-RIP)(%rsp)
- jnz irq_return_ldt
+ jnz native_irq_return_ldt
#endif
-irq_return_iret:
- INTERRUPT_RETURN
- _ASM_EXTABLE(irq_return_iret, bad_iret)
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
+native_irq_return_iret:
iretq
- _ASM_EXTABLE(native_iret, bad_iret)
-#endif
+ _ASM_EXTABLE(native_irq_return_iret, bad_iret)
#ifdef CONFIG_X86_ESPFIX64
-irq_return_ldt:
+native_irq_return_ldt:
pushq_cfi %rax
pushq_cfi %rdi
SWAPGS
@@ -872,7 +868,7 @@ irq_return_ldt:
SWAPGS
movq %rax,%rsp
popq_cfi %rax
- jmp irq_return_iret
+ jmp native_irq_return_iret
#endif
.section .fixup,"ax"
@@ -956,13 +952,8 @@ __do_double_fault:
cmpl $__KERNEL_CS,CS(%rdi)
jne do_double_fault
movq RIP(%rdi),%rax
- cmpq $irq_return_iret,%rax
-#ifdef CONFIG_PARAVIRT
- je 1f
- cmpq $native_iret,%rax
-#endif
+ cmpq $native_irq_return_iret,%rax
jne do_double_fault /* This shouldn't happen... */
-1:
movq PER_CPU_VAR(kernel_stack),%rax
subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
movq %rax,RSP(%rdi)
@@ -1395,21 +1386,21 @@ ENTRY(error_entry)
CFI_ADJUST_CFA_OFFSET 15*8
/* oldrax contains error code */
cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
+ movq %rdi, RDI+8(%rsp)
+ movq %rsi, RSI+8(%rsp)
+ movq %rdx, RDX+8(%rsp)
+ movq %rcx, RCX+8(%rsp)
+ movq %rax, RAX+8(%rsp)
+ movq %r8, R8+8(%rsp)
+ movq %r9, R9+8(%rsp)
+ movq %r10, R10+8(%rsp)
+ movq %r11, R11+8(%rsp)
movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
+ movq %rbp, RBP+8(%rsp)
+ movq %r12, R12+8(%rsp)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
@@ -1427,8 +1418,9 @@ error_sti:
* compat mode. Check for these here too.
*/
error_kernelspace:
+ CFI_REL_OFFSET rcx, RCX+8
incl %ebx
- leaq irq_return_iret(%rip),%rcx
+ leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
movl %ecx,%eax /* zero extend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 6afbb16e9b7..94d857fb103 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -175,7 +175,7 @@ void init_espfix_ap(void)
if (!pud_present(pud)) {
pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
- paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PUD_CLONES; n++)
set_pud(&pud_p[n], pud);
}
@@ -185,7 +185,7 @@ void init_espfix_ap(void)
if (!pmd_present(pmd)) {
pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
- paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PMD_CLONES; n++)
set_pmd(&pmd_p[n], pmd);
}
@@ -193,7 +193,6 @@ void init_espfix_ap(void)
pte_p = pte_offset_kernel(&pmd, addr);
stack_page = (void *)__get_free_page(GFP_KERNEL);
pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
- paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PTE_CLONES; n++)
set_pte(&pte_p[n*PTE_STRIDE], pte);
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cbc4a91b131..3386dc9aa33 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -703,6 +703,9 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
unsigned long return_hooker = (unsigned long)
&return_to_handler;
+ if (unlikely(ftrace_graph_is_dead()))
+ return;
+
if (unlikely(atomic_read(&current->tracing_graph_pause)))
return;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5dd8081441..a9a4229f616 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -375,7 +375,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
/*
* These bits must be zero.
*/
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+ memset(xsave_hdr->reserved, 0, 48);
return ret;
}
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
new file mode 100644
index 00000000000..9642b9b3365
--- /dev/null
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -0,0 +1,553 @@
+/*
+ * Kexec bzImage loader
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec-bzImage64: " fmt
+
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/efi.h>
+#include <linux/verify_pefile.h>
+#include <keys/system_keyring.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+#include <asm/crash.h>
+#include <asm/efi.h>
+
+#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */
+
+/*
+ * Defines lowest physical address for various segments. Not sure where
+ * exactly these limits came from. Current bzimage64 loader in kexec-tools
+ * uses these so I am retaining it. It can be changed over time as we gain
+ * more insight.
+ */
+#define MIN_PURGATORY_ADDR 0x3000
+#define MIN_BOOTPARAM_ADDR 0x3000
+#define MIN_KERNEL_LOAD_ADDR 0x100000
+#define MIN_INITRD_LOAD_ADDR 0x1000000
+
+/*
+ * This is a place holder for all boot loader specific data structure which
+ * gets allocated in one call but gets freed much later during cleanup
+ * time. Right now there is only one field but it can grow as need be.
+ */
+struct bzimage64_data {
+ /*
+ * Temporary buffer to hold bootparams buffer. This should be
+ * freed once the bootparam segment has been loaded.
+ */
+ void *bootparams_buf;
+};
+
+static int setup_initrd(struct boot_params *params,
+ unsigned long initrd_load_addr, unsigned long initrd_len)
+{
+ params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL;
+ params->hdr.ramdisk_size = initrd_len & 0xffffffffUL;
+
+ params->ext_ramdisk_image = initrd_load_addr >> 32;
+ params->ext_ramdisk_size = initrd_len >> 32;
+
+ return 0;
+}
+
+static int setup_cmdline(struct kimage *image, struct boot_params *params,
+ unsigned long bootparams_load_addr,
+ unsigned long cmdline_offset, char *cmdline,
+ unsigned long cmdline_len)
+{
+ char *cmdline_ptr = ((char *)params) + cmdline_offset;
+ unsigned long cmdline_ptr_phys, len;
+ uint32_t cmdline_low_32, cmdline_ext_32;
+
+ memcpy(cmdline_ptr, cmdline, cmdline_len);
+ if (image->type == KEXEC_TYPE_CRASH) {
+ len = sprintf(cmdline_ptr + cmdline_len - 1,
+ " elfcorehdr=0x%lx", image->arch.elf_load_addr);
+ cmdline_len += len;
+ }
+ cmdline_ptr[cmdline_len - 1] = '\0';
+
+ pr_debug("Final command line is: %s\n", cmdline_ptr);
+ cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
+ cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
+ cmdline_ext_32 = cmdline_ptr_phys >> 32;
+
+ params->hdr.cmd_line_ptr = cmdline_low_32;
+ if (cmdline_ext_32)
+ params->ext_cmd_line_ptr = cmdline_ext_32;
+
+ return 0;
+}
+
+static int setup_e820_entries(struct boot_params *params)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = e820_saved.nr_map;
+
+ /* TODO: Pass entries more than E820MAX in bootparams setup data */
+ if (nr_e820_entries > E820MAX)
+ nr_e820_entries = E820MAX;
+
+ params->e820_entries = nr_e820_entries;
+ memcpy(&params->e820_map, &e820_saved.map,
+ nr_e820_entries * sizeof(struct e820entry));
+
+ return 0;
+}
+
+#ifdef CONFIG_EFI
+static int setup_efi_info_memmap(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset,
+ unsigned int efi_map_sz)
+{
+ void *efi_map = (void *)params + efi_map_offset;
+ unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!efi_map_sz)
+ return 0;
+
+ efi_runtime_map_copy(efi_map, efi_map_sz);
+
+ ei->efi_memmap = efi_map_phys_addr & 0xffffffff;
+ ei->efi_memmap_hi = efi_map_phys_addr >> 32;
+ ei->efi_memmap_size = efi_map_sz;
+
+ return 0;
+}
+
+static int
+prepare_add_efi_setup_data(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned long setup_data_phys;
+ struct setup_data *sd = (void *)params + efi_setup_data_offset;
+ struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data);
+
+ esd->fw_vendor = efi.fw_vendor;
+ esd->runtime = efi.runtime;
+ esd->tables = efi.config_table;
+ esd->smbios = efi.smbios;
+
+ sd->type = SETUP_EFI;
+ sd->len = sizeof(struct efi_setup_data);
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + efi_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+
+ return 0;
+}
+
+static int
+setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ struct efi_info *current_ei = &boot_params.efi_info;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!current_ei->efi_memmap_size)
+ return 0;
+
+ /*
+ * If 1:1 mapping is not enabled, second kernel can not setup EFI
+ * and use EFI run time services. User space will have to pass
+ * acpi_rsdp=<addr> on kernel command line to make second kernel boot
+ * without efi.
+ */
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return 0;
+
+ ei->efi_loader_signature = current_ei->efi_loader_signature;
+ ei->efi_systab = current_ei->efi_systab;
+ ei->efi_systab_hi = current_ei->efi_systab_hi;
+
+ ei->efi_memdesc_version = current_ei->efi_memdesc_version;
+ ei->efi_memdesc_size = efi_get_runtime_map_desc_size();
+
+ setup_efi_info_memmap(params, params_load_addr, efi_map_offset,
+ efi_map_sz);
+ prepare_add_efi_setup_data(params, params_load_addr,
+ efi_setup_data_offset);
+ return 0;
+}
+#endif /* CONFIG_EFI */
+
+static int
+setup_boot_parameters(struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned int nr_e820_entries;
+ unsigned long long mem_k, start, end;
+ int i, ret = 0;
+
+ /* Get subarch from existing bootparams */
+ params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
+
+ /* Copying screen_info will do? */
+ memcpy(&params->screen_info, &boot_params.screen_info,
+ sizeof(struct screen_info));
+
+ /* Fill in memsize later */
+ params->screen_info.ext_mem_k = 0;
+ params->alt_mem_k = 0;
+
+ /* Default APM info */
+ memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
+
+ /* Default drive info */
+ memset(&params->hd0_info, 0, sizeof(params->hd0_info));
+ memset(&params->hd1_info, 0, sizeof(params->hd1_info));
+
+ /* Default sysdesc table */
+ params->sys_desc_table.length = 0;
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_setup_memmap_entries(image, params);
+ if (ret)
+ return ret;
+ } else
+ setup_e820_entries(params);
+
+ nr_e820_entries = params->e820_entries;
+
+ for (i = 0; i < nr_e820_entries; i++) {
+ if (params->e820_map[i].type != E820_RAM)
+ continue;
+ start = params->e820_map[i].addr;
+ end = params->e820_map[i].addr + params->e820_map[i].size - 1;
+
+ if ((start <= 0x100000) && end > 0x100000) {
+ mem_k = (end >> 10) - (0x100000 >> 10);
+ params->screen_info.ext_mem_k = mem_k;
+ params->alt_mem_k = mem_k;
+ if (mem_k > 0xfc00)
+ params->screen_info.ext_mem_k = 0xfc00; /* 64M*/
+ if (mem_k > 0xffffffff)
+ params->alt_mem_k = 0xffffffff;
+ }
+ }
+
+#ifdef CONFIG_EFI
+ /* Setup EFI state */
+ setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+#endif
+
+ /* Setup EDD info */
+ memcpy(params->eddbuf, boot_params.eddbuf,
+ EDDMAXNR * sizeof(struct edd_info));
+ params->eddbuf_entries = boot_params.eddbuf_entries;
+
+ memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer,
+ EDD_MBR_SIG_MAX * sizeof(unsigned int));
+
+ return ret;
+}
+
+int bzImage64_probe(const char *buf, unsigned long len)
+{
+ int ret = -ENOEXEC;
+ struct setup_header *header;
+
+ /* kernel should be atleast two sectors long */
+ if (len < 2 * 512) {
+ pr_err("File is too short to be a bzImage\n");
+ return ret;
+ }
+
+ header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr));
+ if (memcmp((char *)&header->header, "HdrS", 4) != 0) {
+ pr_err("Not a bzImage\n");
+ return ret;
+ }
+
+ if (header->boot_flag != 0xAA55) {
+ pr_err("No x86 boot sector present\n");
+ return ret;
+ }
+
+ if (header->version < 0x020C) {
+ pr_err("Must be at least protocol version 2.12\n");
+ return ret;
+ }
+
+ if (!(header->loadflags & LOADED_HIGH)) {
+ pr_err("zImage not a bzImage\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_KERNEL_64)) {
+ pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) {
+ pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n");
+ return ret;
+ }
+
+ /*
+ * Can't handle 32bit EFI as it does not allow loading kernel
+ * above 4G. This should be handled by 32bit bzImage loader
+ */
+ if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) {
+ pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n");
+ return ret;
+ }
+
+ /* I've got a bzImage */
+ pr_debug("It's a relocatable bzImage64\n");
+ ret = 0;
+
+ return ret;
+}
+
+void *bzImage64_load(struct kimage *image, char *kernel,
+ unsigned long kernel_len, char *initrd,
+ unsigned long initrd_len, char *cmdline,
+ unsigned long cmdline_len)
+{
+
+ struct setup_header *header;
+ int setup_sects, kern16_size, ret = 0;
+ unsigned long setup_header_size, params_cmdline_sz, params_misc_sz;
+ struct boot_params *params;
+ unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
+ unsigned long purgatory_load_addr;
+ unsigned long kernel_bufsz, kernel_memsz, kernel_align;
+ char *kernel_buf;
+ struct bzimage64_data *ldata;
+ struct kexec_entry64_regs regs64;
+ void *stack;
+ unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
+ unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+
+ header = (struct setup_header *)(kernel + setup_hdr_offset);
+ setup_sects = header->setup_sects;
+ if (setup_sects == 0)
+ setup_sects = 4;
+
+ kern16_size = (setup_sects + 1) * 512;
+ if (kernel_len < kern16_size) {
+ pr_err("bzImage truncated\n");
+ return ERR_PTR(-ENOEXEC);
+ }
+
+ if (cmdline_len > header->cmdline_size) {
+ pr_err("Kernel command line too long\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * In case of crash dump, we will append elfcorehdr=<addr> to
+ * command line. Make sure it does not overflow
+ */
+ if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
+ pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Allocate and load backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_load_segments(image);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ /*
+ * Load purgatory. For 64bit entry point, purgatory code can be
+ * anywhere.
+ */
+ ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1,
+ &purgatory_load_addr);
+ if (ret) {
+ pr_err("Loading purgatory failed\n");
+ return ERR_PTR(ret);
+ }
+
+ pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr);
+
+
+ /*
+ * Load Bootparams and cmdline and space for efi stuff.
+ *
+ * Allocate memory together for multiple data structures so
+ * that they all can go in single area/segment and we don't
+ * have to create separate segment for each. Keeps things
+ * little bit simple
+ */
+ efi_map_sz = efi_get_runtime_map_size();
+ efi_map_sz = ALIGN(efi_map_sz, 16);
+ params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
+ MAX_ELFCOREHDR_STR_LEN;
+ params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
+ params_misc_sz = params_cmdline_sz + efi_map_sz +
+ sizeof(struct setup_data) +
+ sizeof(struct efi_setup_data);
+
+ params = kzalloc(params_misc_sz, GFP_KERNEL);
+ if (!params)
+ return ERR_PTR(-ENOMEM);
+ efi_map_offset = params_cmdline_sz;
+ efi_setup_data_offset = efi_map_offset + efi_map_sz;
+
+ /* Copy setup header onto bootparams. Documentation/x86/boot.txt */
+ setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;
+
+ /* Is there a limit on setup header size? */
+ memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
+
+ ret = kexec_add_buffer(image, (char *)params, params_misc_sz,
+ params_misc_sz, 16, MIN_BOOTPARAM_ADDR,
+ ULONG_MAX, 1, &bootparam_load_addr);
+ if (ret)
+ goto out_free_params;
+ pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ bootparam_load_addr, params_misc_sz, params_misc_sz);
+
+ /* Load kernel */
+ kernel_buf = kernel + kern16_size;
+ kernel_bufsz = kernel_len - kern16_size;
+ kernel_memsz = PAGE_ALIGN(header->init_size);
+ kernel_align = header->kernel_alignment;
+
+ ret = kexec_add_buffer(image, kernel_buf,
+ kernel_bufsz, kernel_memsz, kernel_align,
+ MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1,
+ &kernel_load_addr);
+ if (ret)
+ goto out_free_params;
+
+ pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_load_addr, kernel_memsz, kernel_memsz);
+
+ /* Load initrd high */
+ if (initrd) {
+ ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len,
+ PAGE_SIZE, MIN_INITRD_LOAD_ADDR,
+ ULONG_MAX, 1, &initrd_load_addr);
+ if (ret)
+ goto out_free_params;
+
+ pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ initrd_load_addr, initrd_len, initrd_len);
+
+ setup_initrd(params, initrd_load_addr, initrd_len);
+ }
+
+ setup_cmdline(image, params, bootparam_load_addr,
+ sizeof(struct boot_params), cmdline, cmdline_len);
+
+ /* bootloader info. Do we need a separate ID for kexec kernel loader? */
+ params->hdr.type_of_loader = 0x0D << 4;
+ params->hdr.loadflags = 0;
+
+ /* Setup purgatory regs for entry */
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 1);
+ if (ret)
+ goto out_free_params;
+
+ regs64.rbx = 0; /* Bootstrap Processor */
+ regs64.rsi = bootparam_load_addr;
+ regs64.rip = kernel_load_addr + 0x200;
+ stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
+ if (IS_ERR(stack)) {
+ pr_err("Could not find address of symbol stack_end\n");
+ ret = -EINVAL;
+ goto out_free_params;
+ }
+
+ regs64.rsp = (unsigned long)stack;
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 0);
+ if (ret)
+ goto out_free_params;
+
+ ret = setup_boot_parameters(image, params, bootparam_load_addr,
+ efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+ if (ret)
+ goto out_free_params;
+
+ /* Allocate loader specific data */
+ ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL);
+ if (!ldata) {
+ ret = -ENOMEM;
+ goto out_free_params;
+ }
+
+ /*
+ * Store pointer to params so that it could be freed after loading
+ * params segment has been loaded and contents have been copied
+ * somewhere else.
+ */
+ ldata->bootparams_buf = params;
+ return ldata;
+
+out_free_params:
+ kfree(params);
+ return ERR_PTR(ret);
+}
+
+/* This cleanup function is called after various segments have been loaded */
+int bzImage64_cleanup(void *loader_data)
+{
+ struct bzimage64_data *ldata = loader_data;
+
+ if (!ldata)
+ return 0;
+
+ kfree(ldata->bootparams_buf);
+ ldata->bootparams_buf = NULL;
+
+ return 0;
+}
+
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
+{
+ bool trusted;
+ int ret;
+
+ ret = verify_pefile_signature(kernel, kernel_len,
+ system_trusted_keyring, &trusted);
+ if (ret < 0)
+ return ret;
+ if (!trusted)
+ return -EKEYREJECTED;
+ return 0;
+}
+#endif
+
+struct kexec_file_ops kexec_bzImage64_ops = {
+ .probe = bzImage64_probe,
+ .load = bzImage64_load,
+ .cleanup = bzImage64_cleanup,
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+ .verify_sig = bzImage64_verify_sig,
+#endif
+};
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 7596df66490..67e6d19ef1b 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -574,6 +574,9 @@ int kprobe_int3_handler(struct pt_regs *regs)
struct kprobe *p;
struct kprobe_ctlblk *kcb;
+ if (user_mode_vm(regs))
+ return 0;
+
addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
/*
* We don't want to be preempted for the entire
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 679cef0791c..8b04018e5d1 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -6,6 +6,8 @@
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
@@ -21,6 +23,11 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/debugreg.h>
+#include <asm/kexec-bzimage64.h>
+
+static struct kexec_file_ops *kexec_file_loaders[] = {
+ &kexec_bzImage64_ops,
+};
static void free_transition_pgtable(struct kimage *image)
{
@@ -171,6 +178,38 @@ static void load_segments(void)
);
}
+/* Update purgatory as needed after various image segments have been prepared */
+static int arch_update_purgatory(struct kimage *image)
+{
+ int ret = 0;
+
+ if (!image->file_mode)
+ return 0;
+
+ /* Setup copying of backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = kexec_purgatory_get_set_symbol(image, "backup_dest",
+ &image->arch.backup_load_addr,
+ sizeof(image->arch.backup_load_addr), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image, "backup_src",
+ &image->arch.backup_src_start,
+ sizeof(image->arch.backup_src_start), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image, "backup_sz",
+ &image->arch.backup_src_sz,
+ sizeof(image->arch.backup_src_sz), 0);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+
int machine_kexec_prepare(struct kimage *image)
{
unsigned long start_pgtable;
@@ -184,6 +223,11 @@ int machine_kexec_prepare(struct kimage *image)
if (result)
return result;
+ /* update purgatory as needed */
+ result = arch_update_purgatory(image);
+ if (result)
+ return result;
+
return 0;
}
@@ -283,3 +327,198 @@ void arch_crash_save_vmcoreinfo(void)
(unsigned long)&_text - __START_KERNEL);
}
+/* arch-dependent functionality related to kexec file-based syscall */
+
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ int i, ret = -ENOEXEC;
+ struct kexec_file_ops *fops;
+
+ for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) {
+ fops = kexec_file_loaders[i];
+ if (!fops || !fops->probe)
+ continue;
+
+ ret = fops->probe(buf, buf_len);
+ if (!ret) {
+ image->fops = fops;
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+void *arch_kexec_kernel_image_load(struct kimage *image)
+{
+ vfree(image->arch.elf_headers);
+ image->arch.elf_headers = NULL;
+
+ if (!image->fops || !image->fops->load)
+ return ERR_PTR(-ENOEXEC);
+
+ return image->fops->load(image, image->kernel_buf,
+ image->kernel_buf_len, image->initrd_buf,
+ image->initrd_buf_len, image->cmdline_buf,
+ image->cmdline_buf_len);
+}
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+ if (!image->fops || !image->fops->cleanup)
+ return 0;
+
+ return image->fops->cleanup(image->image_loader_data);
+}
+
+int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel,
+ unsigned long kernel_len)
+{
+ if (!image->fops || !image->fops->verify_sig) {
+ pr_debug("kernel loader does not support signature verification.");
+ return -EKEYREJECTED;
+ }
+
+ return image->fops->verify_sig(kernel, kernel_len);
+}
+
+/*
+ * Apply purgatory relocations.
+ *
+ * ehdr: Pointer to elf headers
+ * sechdrs: Pointer to section headers.
+ * relsec: section index of SHT_RELA section.
+ *
+ * TODO: Some of the code belongs to generic code. Move that in kexec.c.
+ */
+int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr,
+ Elf64_Shdr *sechdrs, unsigned int relsec)
+{
+ unsigned int i;
+ Elf64_Rela *rel;
+ Elf64_Sym *sym;
+ void *location;
+ Elf64_Shdr *section, *symtabsec;
+ unsigned long address, sec_base, value;
+ const char *strtab, *name, *shstrtab;
+
+ /*
+ * ->sh_offset has been modified to keep the pointer to section
+ * contents in memory
+ */
+ rel = (void *)sechdrs[relsec].sh_offset;
+
+ /* Section to which relocations apply */
+ section = &sechdrs[sechdrs[relsec].sh_info];
+
+ pr_debug("Applying relocate section %u to %u\n", relsec,
+ sechdrs[relsec].sh_info);
+
+ /* Associated symbol table */
+ symtabsec = &sechdrs[sechdrs[relsec].sh_link];
+
+ /* String table */
+ if (symtabsec->sh_link >= ehdr->e_shnum) {
+ /* Invalid strtab section number */
+ pr_err("Invalid string table section index %d\n",
+ symtabsec->sh_link);
+ return -ENOEXEC;
+ }
+
+ strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset;
+
+ /* section header string table */
+ shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset;
+
+ for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+
+ /*
+ * rel[i].r_offset contains byte offset from beginning
+ * of section to the storage unit affected.
+ *
+ * This is location to update (->sh_offset). This is temporary
+ * buffer where section is currently loaded. This will finally
+ * be loaded to a different address later, pointed to by
+ * ->sh_addr. kexec takes care of moving it
+ * (kexec_load_segment()).
+ */
+ location = (void *)(section->sh_offset + rel[i].r_offset);
+
+ /* Final address of the location */
+ address = section->sh_addr + rel[i].r_offset;
+
+ /*
+ * rel[i].r_info contains information about symbol table index
+ * w.r.t which relocation must be made and type of relocation
+ * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get
+ * these respectively.
+ */
+ sym = (Elf64_Sym *)symtabsec->sh_offset +
+ ELF64_R_SYM(rel[i].r_info);
+
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
+ name, sym->st_info, sym->st_shndx, sym->st_value,
+ sym->st_size);
+
+ if (sym->st_shndx == SHN_UNDEF) {
+ pr_err("Undefined symbol: %s\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_ABS)
+ sec_base = 0;
+ else if (sym->st_shndx >= ehdr->e_shnum) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
+ return -ENOEXEC;
+ } else
+ sec_base = sechdrs[sym->st_shndx].sh_addr;
+
+ value = sym->st_value;
+ value += sec_base;
+ value += rel[i].r_addend;
+
+ switch (ELF64_R_TYPE(rel[i].r_info)) {
+ case R_X86_64_NONE:
+ break;
+ case R_X86_64_64:
+ *(u64 *)location = value;
+ break;
+ case R_X86_64_32:
+ *(u32 *)location = value;
+ if (value != *(u32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_32S:
+ *(s32 *)location = value;
+ if ((s64)value != *(s32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_PC32:
+ value -= (u64)address;
+ *(u32 *)location = value;
+ break;
+ default:
+ pr_err("Unknown rela relocation: %llu\n",
+ ELF64_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+
+overflow:
+ pr_err("Overflow in relocation type %d value 0x%lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), value);
+ return -ENOEXEC;
+}
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
index c050a015316..c73aecf10d3 100644
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/mcount_64.S
@@ -46,10 +46,6 @@ END(function_hook)
.endm
ENTRY(ftrace_caller)
- /* Check if tracing was disabled (quick check) */
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
ftrace_caller_setup
/* regs go into 4th parameter (but make it NULL) */
movq $0, %rcx
@@ -73,10 +69,6 @@ ENTRY(ftrace_regs_caller)
/* Save the current flags before compare (in SS location)*/
pushfq
- /* Check if tracing was disabled (quick check) */
- cmpl $0, function_trace_stop
- jne ftrace_restore_flags
-
/* skip=8 to skip flags saved in SS */
ftrace_caller_setup 8
@@ -131,7 +123,7 @@ GLOBAL(ftrace_regs_call)
popfq
jmp ftrace_return
-ftrace_restore_flags:
+
popfq
jmp ftrace_stub
@@ -141,9 +133,6 @@ END(ftrace_regs_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(function_hook)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34f93e..a1da6737ba5 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
@@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, iret);
PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
new file mode 100644
index 00000000000..0c424a67985
--- /dev/null
+++ b/arch/x86/kernel/pmc_atom.c
@@ -0,0 +1,321 @@
+/*
+ * Intel Atom SOC Power Management Controller Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/io.h>
+
+#include <asm/pmc_atom.h>
+
+#define DRIVER_NAME KBUILD_MODNAME
+
+struct pmc_dev {
+ u32 base_addr;
+ void __iomem *regmap;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *dbgfs_dir;
+#endif /* CONFIG_DEBUG_FS */
+};
+
+static struct pmc_dev pmc_device;
+static u32 acpi_base_addr;
+
+struct pmc_dev_map {
+ const char *name;
+ u32 bit_mask;
+};
+
+static const struct pmc_dev_map dev_map[] = {
+ {"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA},
+ {"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1},
+ {"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2},
+ {"3 - LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1},
+ {"4 - LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2},
+ {"5 - LPSS1_F5_SPI", BIT_LPSS1_F5_SPI},
+ {"6 - LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX},
+ {"7 - LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX},
+ {"8 - SCC_EMMC", BIT_SCC_EMMC},
+ {"9 - SCC_SDIO", BIT_SCC_SDIO},
+ {"10 - SCC_SDCARD", BIT_SCC_SDCARD},
+ {"11 - SCC_MIPI", BIT_SCC_MIPI},
+ {"12 - HDA", BIT_HDA},
+ {"13 - LPE", BIT_LPE},
+ {"14 - OTG", BIT_OTG},
+ {"15 - USH", BIT_USH},
+ {"16 - GBE", BIT_GBE},
+ {"17 - SATA", BIT_SATA},
+ {"18 - USB_EHCI", BIT_USB_EHCI},
+ {"19 - SEC", BIT_SEC},
+ {"20 - PCIE_PORT0", BIT_PCIE_PORT0},
+ {"21 - PCIE_PORT1", BIT_PCIE_PORT1},
+ {"22 - PCIE_PORT2", BIT_PCIE_PORT2},
+ {"23 - PCIE_PORT3", BIT_PCIE_PORT3},
+ {"24 - LPSS2_F0_DMA", BIT_LPSS2_F0_DMA},
+ {"25 - LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1},
+ {"26 - LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2},
+ {"27 - LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3},
+ {"28 - LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4},
+ {"29 - LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5},
+ {"30 - LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6},
+ {"31 - LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7},
+ {"32 - SMB", BIT_SMB},
+ {"33 - OTG_SS_PHY", BIT_OTG_SS_PHY},
+ {"34 - USH_SS_PHY", BIT_USH_SS_PHY},
+ {"35 - DFX", BIT_DFX},
+};
+
+static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
+{
+ return readl(pmc->regmap + reg_offset);
+}
+
+static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val)
+{
+ writel(val, pmc->regmap + reg_offset);
+}
+
+static void pmc_power_off(void)
+{
+ u16 pm1_cnt_port;
+ u32 pm1_cnt_value;
+
+ pr_info("Preparing to enter system sleep state S5\n");
+
+ pm1_cnt_port = acpi_base_addr + PM1_CNT;
+
+ pm1_cnt_value = inl(pm1_cnt_port);
+ pm1_cnt_value &= SLEEP_TYPE_MASK;
+ pm1_cnt_value |= SLEEP_TYPE_S5;
+ pm1_cnt_value |= SLEEP_ENABLE;
+
+ outl(pm1_cnt_value, pm1_cnt_port);
+}
+
+static void pmc_hw_reg_setup(struct pmc_dev *pmc)
+{
+ /*
+ * Disable PMC S0IX_WAKE_EN events coming from:
+ * - LPC clock run
+ * - GPIO_SUS ored dedicated IRQs
+ * - GPIO_SCORE ored dedicated IRQs
+ * - GPIO_SUS shared IRQ
+ * - GPIO_SCORE shared IRQ
+ */
+ pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int pmc_dev_state_show(struct seq_file *s, void *unused)
+{
+ struct pmc_dev *pmc = s->private;
+ u32 func_dis, func_dis_2, func_dis_index;
+ u32 d3_sts_0, d3_sts_1, d3_sts_index;
+ int dev_num, dev_index, reg_index;
+
+ func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
+ func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
+ d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
+ d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
+
+ dev_num = ARRAY_SIZE(dev_map);
+
+ for (dev_index = 0; dev_index < dev_num; dev_index++) {
+ reg_index = dev_index / PMC_REG_BIT_WIDTH;
+ if (reg_index) {
+ func_dis_index = func_dis_2;
+ d3_sts_index = d3_sts_1;
+ } else {
+ func_dis_index = func_dis;
+ d3_sts_index = d3_sts_0;
+ }
+
+ seq_printf(s, "Dev: %-32s\tState: %s [%s]\n",
+ dev_map[dev_index].name,
+ dev_map[dev_index].bit_mask & func_dis_index ?
+ "Disabled" : "Enabled ",
+ dev_map[dev_index].bit_mask & d3_sts_index ?
+ "D3" : "D0");
+ }
+ return 0;
+}
+
+static int pmc_dev_state_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pmc_dev_state_show, inode->i_private);
+}
+
+static const struct file_operations pmc_dev_state_ops = {
+ .open = pmc_dev_state_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int pmc_sleep_tmr_show(struct seq_file *s, void *unused)
+{
+ struct pmc_dev *pmc = s->private;
+ u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr;
+
+ s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT;
+ s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT;
+ s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT;
+ s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT;
+ s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT;
+
+ seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr);
+ seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr);
+ seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr);
+ seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr);
+ seq_printf(s, "S0 Residency:\t%lldus\n", s0_tmr);
+ return 0;
+}
+
+static int pmc_sleep_tmr_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pmc_sleep_tmr_show, inode->i_private);
+}
+
+static const struct file_operations pmc_sleep_tmr_ops = {
+ .open = pmc_sleep_tmr_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
+{
+ if (!pmc->dbgfs_dir)
+ return;
+
+ debugfs_remove_recursive(pmc->dbgfs_dir);
+ pmc->dbgfs_dir = NULL;
+}
+
+static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
+{
+ struct dentry *dir, *f;
+
+ dir = debugfs_create_dir("pmc_atom", NULL);
+ if (!dir)
+ return -ENOMEM;
+
+ f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
+ dir, pmc, &pmc_dev_state_ops);
+ if (!f) {
+ dev_err(&pdev->dev, "dev_states register failed\n");
+ goto err;
+ }
+ f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
+ dir, pmc, &pmc_sleep_tmr_ops);
+ if (!f) {
+ dev_err(&pdev->dev, "sleep_state register failed\n");
+ goto err;
+ }
+ pmc->dbgfs_dir = dir;
+ return 0;
+err:
+ pmc_dbgfs_unregister(pmc);
+ return -ENODEV;
+}
+#endif /* CONFIG_DEBUG_FS */
+
+static int pmc_setup_dev(struct pci_dev *pdev)
+{
+ struct pmc_dev *pmc = &pmc_device;
+ int ret;
+
+ /* Obtain ACPI base address */
+ pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr);
+ acpi_base_addr &= ACPI_BASE_ADDR_MASK;
+
+ /* Install power off function */
+ if (acpi_base_addr != 0 && pm_power_off == NULL)
+ pm_power_off = pmc_power_off;
+
+ pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr);
+ pmc->base_addr &= PMC_BASE_ADDR_MASK;
+
+ pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN);
+ if (!pmc->regmap) {
+ dev_err(&pdev->dev, "error: ioremap failed\n");
+ return -ENOMEM;
+ }
+
+ /* PMC hardware registers setup */
+ pmc_hw_reg_setup(pmc);
+
+#ifdef CONFIG_DEBUG_FS
+ ret = pmc_dbgfs_register(pmc, pdev);
+ if (ret) {
+ iounmap(pmc->regmap);
+ return ret;
+ }
+#endif /* CONFIG_DEBUG_FS */
+ return 0;
+}
+
+/*
+ * Data for PCI driver interface
+ *
+ * This data only exists for exporting the supported
+ * PCI ids via MODULE_DEVICE_TABLE. We do not actually
+ * register a pci_driver, because lpc_ich will register
+ * a driver on the same PCI id.
+ */
+static const struct pci_device_id pmc_pci_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_VLV_PMC) },
+ { 0, },
+};
+
+MODULE_DEVICE_TABLE(pci, pmc_pci_ids);
+
+static int __init pmc_atom_init(void)
+{
+ int err = -ENODEV;
+ struct pci_dev *pdev = NULL;
+ const struct pci_device_id *ent;
+
+ /* We look for our device - PCU PMC
+ * we assume that there is max. one device.
+ *
+ * We can't use plain pci_driver mechanism,
+ * as the device is really a multiple function device,
+ * main driver that binds to the pci_device is lpc_ich
+ * and have to find & bind to the device this way.
+ */
+ for_each_pci_dev(pdev) {
+ ent = pci_match_id(pmc_pci_ids, pdev);
+ if (ent) {
+ err = pmc_setup_dev(pdev);
+ goto out;
+ }
+ }
+ /* Device not found. */
+out:
+ return err;
+}
+
+module_init(pmc_atom_init);
+/* no module_exit, this driver shouldn't be unloaded */
+
+MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>");
+MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4505e2a950d..f804dc935d2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -93,6 +93,7 @@ void arch_task_cache_init(void)
kmem_cache_create("task_xstate", xstate_size,
__alignof__(union thread_xstate),
SLAB_PANIC | SLAB_NOTRACK, NULL);
+ setup_xstate_comp();
}
/*
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 52b1157c53e..17962e667a9 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -28,6 +28,7 @@
#include <linux/mc146818rtc.h>
#include <asm/realmode.h>
#include <asm/x86_init.h>
+#include <asm/efi.h>
/*
* Power off function, if any
@@ -401,12 +402,25 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
static int __init reboot_init(void)
{
+ int rv;
+
/*
* Only do the DMI check if reboot_type hasn't been overridden
* on the command line
*/
- if (reboot_default)
- dmi_check_system(reboot_dmi_table);
+ if (!reboot_default)
+ return 0;
+
+ /*
+ * The DMI quirks table takes precedence. If no quirks entry
+ * matches and the ACPI Hardware Reduced bit is set, force EFI
+ * reboot.
+ */
+ rv = dmi_check_system(reboot_dmi_table);
+
+ if (!rv && efi_reboot_required())
+ reboot_type = BOOT_EFI;
+
return 0;
}
core_initcall(reboot_init);
@@ -528,11 +542,7 @@ static void native_machine_emergency_restart(void)
break;
case BOOT_EFI:
- if (efi_enabled(EFI_RUNTIME_SERVICES))
- efi.reset_system(reboot_mode == REBOOT_WARM ?
- EFI_RESET_WARM :
- EFI_RESET_COLD,
- EFI_SUCCESS, 0, NULL);
+ efi_reboot(reboot_mode, NULL);
reboot_type = BOOT_BIOS;
break;
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 2a26819bb6a..80eab01c1a6 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -37,10 +37,12 @@ static void remove_e820_regions(struct resource *avail)
void arch_remove_reservations(struct resource *avail)
{
- /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
+ /*
+ * Trim out BIOS area (high 2MB) and E820 regions. We do not remove
+ * the low 1MB unconditionally, as this area is needed for some ISA
+ * cards requiring a memory range, e.g. the i82365 PCMCIA controller.
+ */
if (avail->flags & IORESOURCE_MEM) {
- if (avail->start < BIOS_END)
- avail->start = BIOS_END;
resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
remove_e820_regions(avail);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 78a0e629892..41ead8d3bc0 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -924,10 +924,10 @@ void __init setup_arch(char **cmdline_p)
#endif
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL32", 4)) {
+ EFI32_LOADER_SIGNATURE, 4)) {
set_bit(EFI_BOOT, &efi.flags);
} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
- "EL64", 4)) {
+ EFI64_LOADER_SIGNATURE, 4)) {
set_bit(EFI_BOOT, &efi.flags);
set_bit(EFI_64BIT, &efi.flags);
}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index a0da58db43a..2851d63c120 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -363,7 +363,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
/* Set up to return from userspace. */
restorer = current->mm->context.vdso +
- selected_vdso32->sym___kernel_sigreturn;
+ selected_vdso32->sym___kernel_rt_sigreturn;
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
put_user_ex(restorer, &frame->pretcode);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 57e5ce126d5..b6025f9e36c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -234,9 +234,6 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
return ns;
}
-/* XXX surely we already have this someplace in the kernel?! */
-#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
-
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
{
unsigned long long tsc_now, ns_now;
@@ -259,7 +256,9 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
+ data->cyc2ns_mul =
+ DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
+ cpu_khz);
data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
data->cyc2ns_offset = ns_now -
mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
@@ -920,9 +919,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- }
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu);
+ }
return 0;
}
@@ -951,7 +950,7 @@ core_initcall(cpufreq_tsc);
static struct clocksource clocksource_tsc;
/*
- * We compare the TSC to the cycle_last value in the clocksource
+ * We used to compare the TSC to the cycle_last value in the clocksource
* structure to avoid a nasty time-warp. This can be observed in a
* very small window right after one CPU updated cycle_last under
* xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
@@ -961,26 +960,23 @@ static struct clocksource clocksource_tsc;
* due to the unsigned delta calculation of the time keeping core
* code, which is necessary to support wrapping clocksources like pm
* timer.
+ *
+ * This sanity check is now done in the core timekeeping code.
+ * checking the result of read_tsc() - cycle_last for being negative.
+ * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
*/
static cycle_t read_tsc(struct clocksource *cs)
{
- cycle_t ret = (cycle_t)get_cycles();
-
- return ret >= clocksource_tsc.cycle_last ?
- ret : clocksource_tsc.cycle_last;
-}
-
-static void resume_tsc(struct clocksource *cs)
-{
- if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
- clocksource_tsc.cycle_last = 0;
+ return (cycle_t)get_cycles();
}
+/*
+ * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
+ */
static struct clocksource clocksource_tsc = {
.name = "tsc",
.rating = 300,
.read = read_tsc,
- .resume = resume_tsc,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index ea5b5709aa7..e1e1e80fc6a 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -81,10 +81,10 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
if (!show_unhandled_signals)
return;
- pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
- level, current->comm, task_pid_nr(current),
- message, regs->ip, regs->cs,
- regs->sp, regs->ax, regs->si, regs->di);
+ printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, current->comm, task_pid_nr(current),
+ message, regs->ip, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
}
static int addr_to_vsyscall_nr(unsigned long addr)
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
index 9531fbb123b..c7d791f32b9 100644
--- a/arch/x86/kernel/vsyscall_gtod.c
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -31,29 +31,30 @@ void update_vsyscall(struct timekeeper *tk)
gtod_write_begin(vdata);
/* copy vsyscall data */
- vdata->vclock_mode = tk->clock->archdata.vclock_mode;
- vdata->cycle_last = tk->clock->cycle_last;
- vdata->mask = tk->clock->mask;
- vdata->mult = tk->mult;
- vdata->shift = tk->shift;
+ vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode;
+ vdata->cycle_last = tk->tkr.cycle_last;
+ vdata->mask = tk->tkr.mask;
+ vdata->mult = tk->tkr.mult;
+ vdata->shift = tk->tkr.shift;
vdata->wall_time_sec = tk->xtime_sec;
- vdata->wall_time_snsec = tk->xtime_nsec;
+ vdata->wall_time_snsec = tk->tkr.xtime_nsec;
vdata->monotonic_time_sec = tk->xtime_sec
+ tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->xtime_nsec
+ vdata->monotonic_time_snsec = tk->tkr.xtime_nsec
+ ((u64)tk->wall_to_monotonic.tv_nsec
- << tk->shift);
+ << tk->tkr.shift);
while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->shift)) {
+ (((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->shift;
+ ((u64)NSEC_PER_SEC) << tk->tkr.shift;
vdata->monotonic_time_sec++;
}
vdata->wall_time_coarse_sec = tk->xtime_sec;
- vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift);
+ vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >>
+ tk->tkr.shift);
vdata->monotonic_time_coarse_sec =
vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a4b451c6add..940b142cc11 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -8,6 +8,7 @@
#include <linux/bootmem.h>
#include <linux/compat.h>
+#include <linux/cpu.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/sigframe.h>
@@ -24,7 +25,9 @@ u64 pcntxt_mask;
struct xsave_struct *init_xstate_buf;
static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
-static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
+static unsigned int *xstate_offsets, *xstate_sizes;
+static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8];
+static unsigned int xstate_features;
/*
* If a processor implementation discern that a processor state component is
@@ -283,7 +286,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
if (use_xsave()) {
/* These bits must be zero. */
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+ memset(xsave_hdr->reserved, 0, 48);
/*
* Init the state that is not present in the memory
@@ -479,6 +482,52 @@ static void __init setup_xstate_features(void)
}
/*
+ * This function sets up offsets and sizes of all extended states in
+ * xsave area. This supports both standard format and compacted format
+ * of the xsave aread.
+ *
+ * Input: void
+ * Output: void
+ */
+void setup_xstate_comp(void)
+{
+ unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8];
+ int i;
+
+ /*
+ * The FP xstates and SSE xstates are legacy states. They are always
+ * in the fixed offsets in the xsave area in either compacted form
+ * or standard form.
+ */
+ xstate_comp_offsets[0] = 0;
+ xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space);
+
+ if (!cpu_has_xsaves) {
+ for (i = 2; i < xstate_features; i++) {
+ if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
+ xstate_comp_offsets[i] = xstate_offsets[i];
+ xstate_comp_sizes[i] = xstate_sizes[i];
+ }
+ }
+ return;
+ }
+
+ xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+ for (i = 2; i < xstate_features; i++) {
+ if (test_bit(i, (unsigned long *)&pcntxt_mask))
+ xstate_comp_sizes[i] = xstate_sizes[i];
+ else
+ xstate_comp_sizes[i] = 0;
+
+ if (i > 2)
+ xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
+ + xstate_comp_sizes[i-1];
+
+ }
+}
+
+/*
* setup the xstate image representing the init state
*/
static void __init setup_init_fpu_buf(void)
@@ -496,15 +545,21 @@ static void __init setup_init_fpu_buf(void)
setup_xstate_features();
+ if (cpu_has_xsaves) {
+ init_xstate_buf->xsave_hdr.xcomp_bv =
+ (u64)1 << 63 | pcntxt_mask;
+ init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask;
+ }
+
/*
* Init all the features state with header_bv being 0x0
*/
- xrstor_state(init_xstate_buf, -1);
+ xrstor_state_booting(init_xstate_buf, -1);
/*
* Dump the init state again. This is to identify the init state
* of any feature which is not represented by all zero's.
*/
- xsave_state(init_xstate_buf, -1);
+ xsave_state_booting(init_xstate_buf, -1);
}
static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
@@ -520,6 +575,30 @@ static int __init eager_fpu_setup(char *s)
}
__setup("eagerfpu=", eager_fpu_setup);
+
+/*
+ * Calculate total size of enabled xstates in XCR0/pcntxt_mask.
+ */
+static void __init init_xstate_size(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ int i;
+
+ if (!cpu_has_xsaves) {
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+ xstate_size = ebx;
+ return;
+ }
+
+ xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ for (i = 2; i < 64; i++) {
+ if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
+ cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+ xstate_size += eax;
+ }
+ }
+}
+
/*
* Enable and initialize the xsave feature.
*/
@@ -551,8 +630,7 @@ static void __init xstate_enable_boot_cpu(void)
/*
* Recompute the context size for enabled features
*/
- cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
- xstate_size = ebx;
+ init_xstate_size();
update_regset_xstate_info(xstate_size, pcntxt_mask);
prepare_fx_sw_frame();
@@ -572,8 +650,9 @@ static void __init xstate_enable_boot_cpu(void)
}
}
- pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
- pcntxt_mask, xstate_size);
+ pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n",
+ pcntxt_mask, xstate_size,
+ cpu_has_xsaves ? "compacted form" : "standard form");
}
/*
@@ -635,3 +714,26 @@ void eager_fpu_init(void)
else
fxrstor_checking(&init_xstate_buf->i387);
}
+
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get xstate address in either
+ * standard format or compacted format of xsave area.
+ *
+ * Inputs:
+ * xsave: base address of the xsave area;
+ * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE,
+ * etc.)
+ * Output:
+ * address of the state in the xsave area.
+ */
+void *get_xsave_addr(struct xsave_struct *xsave, int xstate)
+{
+ int feature = fls64(xstate) - 1;
+ if (!test_bit(feature, (unsigned long *)&pcntxt_mask))
+ return NULL;
+
+ return (void *)xsave + xstate_comp_offsets[feature];
+}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 287e4c85fff..f9d16ff56c6 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -27,6 +27,7 @@ config KVM
select MMU_NOTIFIER
select ANON_INODES
select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index f9087315e0c..a5380590ab0 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -95,4 +95,12 @@ static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
return best && (best->edx & bit(X86_FEATURE_GBPAGES));
}
+
+static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_RTM));
+}
#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e4e833d3d7d..56657b0bb3b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -162,6 +162,10 @@
#define NoWrite ((u64)1 << 45) /* No writeback */
#define SrcWrite ((u64)1 << 46) /* Write back src operand */
#define NoMod ((u64)1 << 47) /* Mod field is ignored */
+#define Intercept ((u64)1 << 48) /* Has valid intercept field */
+#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
+#define NoBigReal ((u64)1 << 50) /* No big real mode */
+#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -426,6 +430,7 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
.modrm_reg = ctxt->modrm_reg,
.modrm_rm = ctxt->modrm_rm,
.src_val = ctxt->src.val64,
+ .dst_val = ctxt->dst.val64,
.src_bytes = ctxt->src.bytes,
.dst_bytes = ctxt->dst.bytes,
.ad_bytes = ctxt->ad_bytes,
@@ -511,12 +516,6 @@ static u32 desc_limit_scaled(struct desc_struct *desc)
return desc->g ? (limit << 12) | 0xfff : limit;
}
-static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
-{
- ctxt->has_seg_override = true;
- ctxt->seg_override = seg;
-}
-
static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
{
if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
@@ -525,14 +524,6 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
return ctxt->ops->get_cached_segment_base(ctxt, seg);
}
-static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
-{
- if (!ctxt->has_seg_override)
- return 0;
-
- return ctxt->seg_override;
-}
-
static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
u32 error, bool valid)
{
@@ -651,7 +642,12 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
if (!fetch && (desc.type & 8) && !(desc.type & 2))
goto bad;
lim = desc_limit_scaled(&desc);
- if ((desc.type & 8) || !(desc.type & 4)) {
+ if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch &&
+ (ctxt->d & NoBigReal)) {
+ /* la is between zero and 0xffff */
+ if (la > 0xffff || (u32)(la + size - 1) > 0xffff)
+ goto bad;
+ } else if ((desc.type & 8) || !(desc.type & 4)) {
/* expand-up segment */
if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
goto bad;
@@ -716,68 +712,71 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
}
/*
- * Fetch the next byte of the instruction being emulated which is pointed to
- * by ctxt->_eip, then increment ctxt->_eip.
- *
- * Also prefetch the remaining bytes of the instruction without crossing page
+ * Prefetch the remaining bytes of the instruction without crossing page
* boundary if they are not in fetch_cache yet.
*/
-static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest)
+static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
{
- struct fetch_cache *fc = &ctxt->fetch;
int rc;
- int size, cur_size;
-
- if (ctxt->_eip == fc->end) {
- unsigned long linear;
- struct segmented_address addr = { .seg = VCPU_SREG_CS,
- .ea = ctxt->_eip };
- cur_size = fc->end - fc->start;
- size = min(15UL - cur_size,
- PAGE_SIZE - offset_in_page(ctxt->_eip));
- rc = __linearize(ctxt, addr, size, false, true, &linear);
- if (unlikely(rc != X86EMUL_CONTINUE))
- return rc;
- rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
- size, &ctxt->exception);
- if (unlikely(rc != X86EMUL_CONTINUE))
- return rc;
- fc->end += size;
- }
- *dest = fc->data[ctxt->_eip - fc->start];
- ctxt->_eip++;
- return X86EMUL_CONTINUE;
-}
+ unsigned size;
+ unsigned long linear;
+ int cur_size = ctxt->fetch.end - ctxt->fetch.data;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = ctxt->eip + cur_size };
+
+ size = 15UL ^ cur_size;
+ rc = __linearize(ctxt, addr, size, false, true, &linear);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
-static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
- void *dest, unsigned size)
-{
- int rc;
+ size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear));
- /* x86 instructions are limited to 15 bytes. */
- if (unlikely(ctxt->_eip + size - ctxt->eip > 15))
+ /*
+ * One instruction can only straddle two pages,
+ * and one has been loaded at the beginning of
+ * x86_decode_insn. So, if not enough bytes
+ * still, we must have hit the 15-byte boundary.
+ */
+ if (unlikely(size < op_size))
return X86EMUL_UNHANDLEABLE;
- while (size--) {
- rc = do_insn_fetch_byte(ctxt, dest++);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- }
+ rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end,
+ size, &ctxt->exception);
+ if (unlikely(rc != X86EMUL_CONTINUE))
+ return rc;
+ ctxt->fetch.end += size;
return X86EMUL_CONTINUE;
}
+static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
+ unsigned size)
+{
+ if (unlikely(ctxt->fetch.end - ctxt->fetch.ptr < size))
+ return __do_insn_fetch_bytes(ctxt, size);
+ else
+ return X86EMUL_CONTINUE;
+}
+
/* Fetch next part of the instruction being emulated. */
#define insn_fetch(_type, _ctxt) \
-({ unsigned long _x; \
- rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \
+({ _type _x; \
+ \
+ rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \
if (rc != X86EMUL_CONTINUE) \
goto done; \
- (_type)_x; \
+ ctxt->_eip += sizeof(_type); \
+ _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \
+ ctxt->fetch.ptr += sizeof(_type); \
+ _x; \
})
#define insn_fetch_arr(_arr, _size, _ctxt) \
-({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \
+({ \
+ rc = do_insn_fetch_bytes(_ctxt, _size); \
if (rc != X86EMUL_CONTINUE) \
goto done; \
+ ctxt->_eip += (_size); \
+ memcpy(_arr, ctxt->fetch.ptr, _size); \
+ ctxt->fetch.ptr += (_size); \
})
/*
@@ -1063,19 +1062,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
struct operand *op)
{
u8 sib;
- int index_reg = 0, base_reg = 0, scale;
+ int index_reg, base_reg, scale;
int rc = X86EMUL_CONTINUE;
ulong modrm_ea = 0;
- if (ctxt->rex_prefix) {
- ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */
- index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
- ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
- }
+ ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
+ index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
+ base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
- ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
+ ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
- ctxt->modrm_rm |= (ctxt->modrm & 0x07);
+ ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07);
ctxt->modrm_seg = VCPU_SREG_DS;
if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
@@ -1093,7 +1090,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
if (ctxt->d & Mmx) {
op->type = OP_MM;
op->bytes = 8;
- op->addr.xmm = ctxt->modrm_rm & 7;
+ op->addr.mm = ctxt->modrm_rm & 7;
return rc;
}
fetch_register_operand(op);
@@ -1190,6 +1187,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
}
}
op->addr.mem.ea = modrm_ea;
+ if (ctxt->ad_bytes != 8)
+ ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+
done:
return rc;
}
@@ -1220,12 +1220,14 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
long sv = 0, mask;
if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
- mask = ~(ctxt->dst.bytes * 8 - 1);
+ mask = ~((long)ctxt->dst.bytes * 8 - 1);
if (ctxt->src.bytes == 2)
sv = (s16)ctxt->src.val & (s16)mask;
else if (ctxt->src.bytes == 4)
sv = (s32)ctxt->src.val & (s32)mask;
+ else
+ sv = (s64)ctxt->src.val & (s64)mask;
ctxt->dst.addr.mem.ea += (sv >> 3);
}
@@ -1315,8 +1317,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
in_page = (ctxt->eflags & EFLG_DF) ?
offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
- n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
- count);
+ n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
if (n == 0)
n = 1;
rc->pos = rc->end = 0;
@@ -1358,17 +1359,19 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
u16 selector, struct desc_ptr *dt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
+ u32 base3 = 0;
if (selector & 1 << 2) {
struct desc_struct desc;
u16 sel;
memset (dt, 0, sizeof *dt);
- if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR))
+ if (!ops->get_segment(ctxt, &sel, &desc, &base3,
+ VCPU_SREG_LDTR))
return;
dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
- dt->address = get_desc_base(&desc);
+ dt->address = get_desc_base(&desc) | ((u64)base3 << 32);
} else
ops->get_gdt(ctxt, dt);
}
@@ -1422,6 +1425,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
ulong desc_addr;
int ret;
u16 dummy;
+ u32 base3 = 0;
memset(&seg_desc, 0, sizeof seg_desc);
@@ -1538,9 +1542,14 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
ret = write_segment_descriptor(ctxt, selector, &seg_desc);
if (ret != X86EMUL_CONTINUE)
return ret;
+ } else if (ctxt->mode == X86EMUL_MODE_PROT64) {
+ ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3,
+ sizeof(base3), &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
}
load:
- ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
+ ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
return X86EMUL_CONTINUE;
exception:
emulate_exception(ctxt, err_vec, err_code, true);
@@ -1575,34 +1584,28 @@ static void write_register_operand(struct operand *op)
static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
{
- int rc;
-
switch (op->type) {
case OP_REG:
write_register_operand(op);
break;
case OP_MEM:
if (ctxt->lock_prefix)
- rc = segmented_cmpxchg(ctxt,
+ return segmented_cmpxchg(ctxt,
+ op->addr.mem,
+ &op->orig_val,
+ &op->val,
+ op->bytes);
+ else
+ return segmented_write(ctxt,
op->addr.mem,
- &op->orig_val,
&op->val,
op->bytes);
- else
- rc = segmented_write(ctxt,
- op->addr.mem,
- &op->val,
- op->bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
break;
case OP_MEM_STR:
- rc = segmented_write(ctxt,
- op->addr.mem,
- op->data,
- op->bytes * op->count);
- if (rc != X86EMUL_CONTINUE)
- return rc;
+ return segmented_write(ctxt,
+ op->addr.mem,
+ op->data,
+ op->bytes * op->count);
break;
case OP_XMM:
write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
@@ -1671,7 +1674,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
return rc;
change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
- | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
+ | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID;
switch(ctxt->mode) {
case X86EMUL_MODE_PROT64:
@@ -1754,6 +1757,9 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
+ if (ctxt->modrm_reg == VCPU_SREG_SS)
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
rc = load_segment_descriptor(ctxt, (u16)selector, seg);
return rc;
}
@@ -1991,6 +1997,9 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
{
u64 old = ctxt->dst.orig_val64;
+ if (ctxt->dst.bytes == 16)
+ return X86EMUL_UNHANDLEABLE;
+
if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
@@ -2017,6 +2026,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
{
int rc;
unsigned long cs;
+ int cpl = ctxt->ops->cpl(ctxt);
rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
@@ -2026,6 +2036,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
+ /* Outer-privilege level return is not implemented */
+ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
+ return X86EMUL_UNHANDLEABLE;
rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
return rc;
}
@@ -2044,8 +2057,10 @@ static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt)
static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
{
/* Save real source value, then compare EAX against destination. */
+ ctxt->dst.orig_val = ctxt->dst.val;
+ ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX);
ctxt->src.orig_val = ctxt->src.val;
- ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
+ ctxt->src.val = ctxt->dst.orig_val;
fastop(ctxt, em_cmp);
if (ctxt->eflags & EFLG_ZF) {
@@ -2055,6 +2070,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
/* Failure: write the value we saw to EAX. */
ctxt->dst.type = OP_REG;
ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ ctxt->dst.val = ctxt->dst.orig_val;
}
return X86EMUL_CONTINUE;
}
@@ -2194,7 +2210,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
*reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip;
if (efer & EFER_LMA) {
#ifdef CONFIG_X86_64
- *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF;
+ *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags;
ops->get_msr(ctxt,
ctxt->mode == X86EMUL_MODE_PROT64 ?
@@ -2202,14 +2218,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ctxt->_eip = msr_data;
ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
- ctxt->eflags &= ~(msr_data | EFLG_RF);
+ ctxt->eflags &= ~msr_data;
#endif
} else {
/* legacy mode */
ops->get_msr(ctxt, MSR_STAR, &msr_data);
ctxt->_eip = (u32)msr_data;
- ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
}
return X86EMUL_CONTINUE;
@@ -2258,7 +2274,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
break;
}
- ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+ ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
cs_sel = (u16)msr_data;
cs_sel &= ~SELECTOR_RPL_MASK;
ss_sel = cs_sel + 8;
@@ -2964,7 +2980,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
static int em_mov(struct x86_emulate_ctxt *ctxt)
{
- memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
+ memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
return X86EMUL_CONTINUE;
}
@@ -3221,7 +3237,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
static int em_smsw(struct x86_emulate_ctxt *ctxt)
{
- ctxt->dst.bytes = 2;
+ if (ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
return X86EMUL_CONTINUE;
}
@@ -3496,7 +3513,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
- (rcx > 3))
+ ctxt->ops->check_pmc(ctxt, rcx))
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
@@ -3521,9 +3538,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
}
#define D(_y) { .flags = (_y) }
-#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
-#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
- .check_perm = (_p) }
+#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i }
+#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
#define N D(NotImpl)
#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
@@ -3532,10 +3549,10 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
#define II(_f, _e, _i) \
- { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
+ { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i }
#define IIP(_f, _e, _i, _p) \
- { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \
- .check_perm = (_p) }
+ { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \
+ .intercept = x86_intercept_##_i, .check_perm = (_p) }
#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
#define D2bv(_f) D((_f) | ByteOp), D(_f)
@@ -3634,8 +3651,8 @@ static const struct opcode group6[] = {
};
static const struct group_dual group7 = { {
- II(Mov | DstMem | Priv, em_sgdt, sgdt),
- II(Mov | DstMem | Priv, em_sidt, sidt),
+ II(Mov | DstMem, em_sgdt, sgdt),
+ II(Mov | DstMem, em_sidt, sidt),
II(SrcMem | Priv, em_lgdt, lgdt),
II(SrcMem | Priv, em_lidt, lidt),
II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
@@ -3899,7 +3916,7 @@ static const struct opcode twobyte_table[256] = {
N, N,
N, N, N, N, N, N, N, N,
/* 0x40 - 0x4F */
- X16(D(DstReg | SrcMem | ModRM | Mov)),
+ X16(D(DstReg | SrcMem | ModRM)),
/* 0x50 - 0x5F */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
/* 0x60 - 0x6F */
@@ -4061,12 +4078,12 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
mem_common:
*op = ctxt->memop;
ctxt->memopp = op;
- if ((ctxt->d & BitOp) && op == &ctxt->dst)
+ if (ctxt->d & BitOp)
fetch_bit_operand(ctxt);
op->orig_val = op->val;
break;
case OpMem64:
- ctxt->memop.bytes = 8;
+ ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8;
goto mem_common;
case OpAcc:
op->type = OP_REG;
@@ -4150,7 +4167,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.mem.ea =
register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI));
- op->addr.mem.seg = seg_override(ctxt);
+ op->addr.mem.seg = ctxt->seg_override;
op->val = 0;
op->count = 1;
break;
@@ -4161,7 +4178,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
register_address(ctxt,
reg_read(ctxt, VCPU_REGS_RBX) +
(reg_read(ctxt, VCPU_REGS_RAX) & 0xff));
- op->addr.mem.seg = seg_override(ctxt);
+ op->addr.mem.seg = ctxt->seg_override;
op->val = 0;
break;
case OpImmFAddr:
@@ -4208,16 +4225,22 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
int mode = ctxt->mode;
int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
bool op_prefix = false;
+ bool has_seg_override = false;
struct opcode opcode;
ctxt->memop.type = OP_NONE;
ctxt->memopp = NULL;
ctxt->_eip = ctxt->eip;
- ctxt->fetch.start = ctxt->_eip;
- ctxt->fetch.end = ctxt->fetch.start + insn_len;
+ ctxt->fetch.ptr = ctxt->fetch.data;
+ ctxt->fetch.end = ctxt->fetch.data + insn_len;
ctxt->opcode_len = 1;
if (insn_len > 0)
memcpy(ctxt->fetch.data, insn, insn_len);
+ else {
+ rc = __do_insn_fetch_bytes(ctxt, 1);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ }
switch (mode) {
case X86EMUL_MODE_REAL:
@@ -4261,11 +4284,13 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
case 0x2e: /* CS override */
case 0x36: /* SS override */
case 0x3e: /* DS override */
- set_seg_override(ctxt, (ctxt->b >> 3) & 3);
+ has_seg_override = true;
+ ctxt->seg_override = (ctxt->b >> 3) & 3;
break;
case 0x64: /* FS override */
case 0x65: /* GS override */
- set_seg_override(ctxt, ctxt->b & 7);
+ has_seg_override = true;
+ ctxt->seg_override = ctxt->b & 7;
break;
case 0x40 ... 0x4f: /* REX */
if (mode != X86EMUL_MODE_PROT64)
@@ -4314,6 +4339,13 @@ done_prefixes:
if (ctxt->d & ModRM)
ctxt->modrm = insn_fetch(u8, ctxt);
+ /* vex-prefix instructions are not implemented */
+ if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
+ (mode == X86EMUL_MODE_PROT64 ||
+ (mode >= X86EMUL_MODE_PROT16 && (ctxt->modrm & 0x80)))) {
+ ctxt->d = NotImpl;
+ }
+
while (ctxt->d & GroupMask) {
switch (ctxt->d & GroupMask) {
case Group:
@@ -4356,49 +4388,59 @@ done_prefixes:
ctxt->d |= opcode.flags;
}
- ctxt->execute = opcode.u.execute;
- ctxt->check_perm = opcode.check_perm;
- ctxt->intercept = opcode.intercept;
-
/* Unrecognised? */
- if (ctxt->d == 0 || (ctxt->d & NotImpl))
+ if (ctxt->d == 0)
return EMULATION_FAILED;
- if (!(ctxt->d & EmulateOnUD) && ctxt->ud)
- return EMULATION_FAILED;
+ ctxt->execute = opcode.u.execute;
- if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
- ctxt->op_bytes = 8;
+ if (unlikely(ctxt->d &
+ (NotImpl|EmulateOnUD|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) {
+ /*
+ * These are copied unconditionally here, and checked unconditionally
+ * in x86_emulate_insn.
+ */
+ ctxt->check_perm = opcode.check_perm;
+ ctxt->intercept = opcode.intercept;
+
+ if (ctxt->d & NotImpl)
+ return EMULATION_FAILED;
+
+ if (!(ctxt->d & EmulateOnUD) && ctxt->ud)
+ return EMULATION_FAILED;
- if (ctxt->d & Op3264) {
- if (mode == X86EMUL_MODE_PROT64)
+ if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
ctxt->op_bytes = 8;
- else
- ctxt->op_bytes = 4;
- }
- if (ctxt->d & Sse)
- ctxt->op_bytes = 16;
- else if (ctxt->d & Mmx)
- ctxt->op_bytes = 8;
+ if (ctxt->d & Op3264) {
+ if (mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ else
+ ctxt->op_bytes = 4;
+ }
+
+ if (ctxt->d & Sse)
+ ctxt->op_bytes = 16;
+ else if (ctxt->d & Mmx)
+ ctxt->op_bytes = 8;
+ }
/* ModRM and SIB bytes. */
if (ctxt->d & ModRM) {
rc = decode_modrm(ctxt, &ctxt->memop);
- if (!ctxt->has_seg_override)
- set_seg_override(ctxt, ctxt->modrm_seg);
+ if (!has_seg_override) {
+ has_seg_override = true;
+ ctxt->seg_override = ctxt->modrm_seg;
+ }
} else if (ctxt->d & MemAbs)
rc = decode_abs(ctxt, &ctxt->memop);
if (rc != X86EMUL_CONTINUE)
goto done;
- if (!ctxt->has_seg_override)
- set_seg_override(ctxt, VCPU_SREG_DS);
-
- ctxt->memop.addr.mem.seg = seg_override(ctxt);
+ if (!has_seg_override)
+ ctxt->seg_override = VCPU_SREG_DS;
- if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8)
- ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+ ctxt->memop.addr.mem.seg = ctxt->seg_override;
/*
* Decode and fetch the source operand: register, memory
@@ -4420,7 +4462,7 @@ done_prefixes:
rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
done:
- if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative)
+ if (ctxt->rip_relative)
ctxt->memopp->addr.mem.ea += ctxt->_eip;
return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
@@ -4495,6 +4537,16 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
return X86EMUL_CONTINUE;
}
+void init_decode_cache(struct x86_emulate_ctxt *ctxt)
+{
+ memset(&ctxt->rip_relative, 0,
+ (void *)&ctxt->modrm - (void *)&ctxt->rip_relative);
+
+ ctxt->io_read.pos = 0;
+ ctxt->io_read.end = 0;
+ ctxt->mem_read.end = 0;
+}
+
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
@@ -4503,12 +4555,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
ctxt->mem_read.pos = 0;
- if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
- (ctxt->d & Undefined)) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
/* LOCK prefix is allowed only with some instructions */
if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
rc = emulate_ud(ctxt);
@@ -4520,69 +4566,82 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
- || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
- if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
- rc = emulate_nm(ctxt);
- goto done;
- }
+ if (unlikely(ctxt->d &
+ (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
+ if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
+ (ctxt->d & Undefined)) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
- if (ctxt->d & Mmx) {
- rc = flush_pending_x87_faults(ctxt);
- if (rc != X86EMUL_CONTINUE)
+ if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
+ || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+ rc = emulate_ud(ctxt);
goto done;
- /*
- * Now that we know the fpu is exception safe, we can fetch
- * operands from it.
- */
- fetch_possible_mmx_operand(ctxt, &ctxt->src);
- fetch_possible_mmx_operand(ctxt, &ctxt->src2);
- if (!(ctxt->d & Mov))
- fetch_possible_mmx_operand(ctxt, &ctxt->dst);
- }
+ }
- if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
- rc = emulator_check_intercept(ctxt, ctxt->intercept,
- X86_ICPT_PRE_EXCEPT);
- if (rc != X86EMUL_CONTINUE)
+ if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ rc = emulate_nm(ctxt);
goto done;
- }
+ }
- /* Privileged instruction can be executed only in CPL=0 */
- if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
- rc = emulate_gp(ctxt, 0);
- goto done;
- }
+ if (ctxt->d & Mmx) {
+ rc = flush_pending_x87_faults(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ /*
+ * Now that we know the fpu is exception safe, we can fetch
+ * operands from it.
+ */
+ fetch_possible_mmx_operand(ctxt, &ctxt->src);
+ fetch_possible_mmx_operand(ctxt, &ctxt->src2);
+ if (!(ctxt->d & Mov))
+ fetch_possible_mmx_operand(ctxt, &ctxt->dst);
+ }
- /* Instruction can only be executed in protected mode */
- if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
- rc = emulate_ud(ctxt);
- goto done;
- }
+ if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_PRE_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
- /* Do instruction specific permission checks */
- if (ctxt->check_perm) {
- rc = ctxt->check_perm(ctxt);
- if (rc != X86EMUL_CONTINUE)
+ /* Privileged instruction can be executed only in CPL=0 */
+ if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
+ if (ctxt->d & PrivUD)
+ rc = emulate_ud(ctxt);
+ else
+ rc = emulate_gp(ctxt, 0);
goto done;
- }
+ }
- if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
- rc = emulator_check_intercept(ctxt, ctxt->intercept,
- X86_ICPT_POST_EXCEPT);
- if (rc != X86EMUL_CONTINUE)
+ /* Instruction can only be executed in protected mode */
+ if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
+ rc = emulate_ud(ctxt);
goto done;
- }
+ }
- if (ctxt->rep_prefix && (ctxt->d & String)) {
- /* All REP prefixes have the same first termination condition */
- if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
- ctxt->eip = ctxt->_eip;
- goto done;
+ /* Do instruction specific permission checks */
+ if (ctxt->d & CheckPerm) {
+ rc = ctxt->check_perm(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ rc = emulator_check_intercept(ctxt, ctxt->intercept,
+ X86_ICPT_POST_EXCEPT);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
+
+ if (ctxt->rep_prefix && (ctxt->d & String)) {
+ /* All REP prefixes have the same first termination condition */
+ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+ ctxt->eip = ctxt->_eip;
+ ctxt->eflags &= ~EFLG_RF;
+ goto done;
+ }
}
}
@@ -4616,13 +4675,18 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
- if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+ if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
goto done;
}
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ ctxt->eflags |= EFLG_RF;
+ else
+ ctxt->eflags &= ~EFLG_RF;
+
if (ctxt->execute) {
if (ctxt->d & Fastop) {
void (*fop)(struct fastop *) = (void *)ctxt->execute;
@@ -4657,8 +4721,9 @@ special_insn:
break;
case 0x90 ... 0x97: /* nop / xchg reg, rax */
if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
- break;
- rc = em_xchg(ctxt);
+ ctxt->dst.type = OP_NONE;
+ else
+ rc = em_xchg(ctxt);
break;
case 0x98: /* cbw/cwde/cdqe */
switch (ctxt->op_bytes) {
@@ -4709,17 +4774,17 @@ special_insn:
goto done;
writeback:
- if (!(ctxt->d & NoWrite)) {
- rc = writeback(ctxt, &ctxt->dst);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- }
if (ctxt->d & SrcWrite) {
BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
rc = writeback(ctxt, &ctxt->src);
if (rc != X86EMUL_CONTINUE)
goto done;
}
+ if (!(ctxt->d & NoWrite)) {
+ rc = writeback(ctxt, &ctxt->dst);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ }
/*
* restore dst type in case the decoding will be reused
@@ -4761,6 +4826,7 @@ writeback:
}
goto done; /* skip rip writeback */
}
+ ctxt->eflags &= ~EFLG_RF;
}
ctxt->eip = ctxt->_eip;
@@ -4793,8 +4859,10 @@ twobyte_insn:
ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
break;
case 0x40 ... 0x4f: /* cmov */
- ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
- if (!test_cc(ctxt->b, ctxt->eflags))
+ if (test_cc(ctxt->b, ctxt->eflags))
+ ctxt->dst.val = ctxt->src.val;
+ else if (ctxt->mode != X86EMUL_MODE_PROT64 ||
+ ctxt->op_bytes != 4)
ctxt->dst.type = OP_NONE; /* no writeback */
break;
case 0x80 ... 0x8f: /* jnz rel, etc*/
@@ -4818,8 +4886,8 @@ twobyte_insn:
break;
case 0xc3: /* movnti */
ctxt->dst.bytes = ctxt->op_bytes;
- ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
- (u64) ctxt->src.val;
+ ctxt->dst.val = (ctxt->op_bytes == 8) ? (u64) ctxt->src.val :
+ (u32) ctxt->src.val;
break;
default:
goto cannot_emulate;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index bd0da433e6d..a1ec6a50a05 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -108,7 +108,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
vector = kvm_cpu_get_extint(v);
- if (kvm_apic_vid_enabled(v->kvm) || vector != -1)
+ if (vector != -1)
return vector; /* PIC */
return kvm_get_apic_interrupt(v); /* APIC */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 00691185817..08e8a899e00 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -352,25 +352,46 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
- apic->irr_pending = false;
+ struct kvm_vcpu *vcpu;
+
+ vcpu = apic->vcpu;
+
apic_clear_vector(vec, apic->regs + APIC_IRR);
- if (apic_search_irr(apic) != -1)
- apic->irr_pending = true;
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ /* try to update RVI */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ else {
+ vec = apic_search_irr(apic);
+ apic->irr_pending = (vec != -1);
+ }
}
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
- /* Note that we never get here with APIC virtualization enabled. */
+ struct kvm_vcpu *vcpu;
+
+ if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ vcpu = apic->vcpu;
- if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
- ++apic->isr_count;
- BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
/*
- * ISR (in service register) bit is set when injecting an interrupt.
- * The highest vector is injected. Thus the latest bit set matches
- * the highest bit in ISR.
+ * With APIC virtualization enabled, all caching is disabled
+ * because the processor can modify ISR under the hood. Instead
+ * just set SVI.
*/
- apic->highest_isr_cache = vec;
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
+ else {
+ ++apic->isr_count;
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ /*
+ * ISR (in service register) bit is set when injecting an interrupt.
+ * The highest vector is injected. Thus the latest bit set matches
+ * the highest bit in ISR.
+ */
+ apic->highest_isr_cache = vec;
+ }
}
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -1451,7 +1472,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
vcpu->arch.apic_arb_prio = 0;
vcpu->arch.apic_attention = 0;
- apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
+ apic_debug("%s: vcpu=%p, id=%d, base_msr="
"0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
vcpu, kvm_apic_id(apic),
vcpu->arch.apic_base, apic->base_address);
@@ -1627,11 +1648,16 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->arch.apic;
- /* Note that we never get here with APIC virtualization enabled. */
-
if (vector == -1)
return -1;
+ /*
+ * We get here even with APIC virtualization enabled, if doing
+ * nested virtualization and L1 runs with the "acknowledge interrupt
+ * on exit" mode. Then we cannot inject the interrupt via RVI,
+ * because the process would deliver it through the IDT.
+ */
+
apic_set_isr(vector, apic);
apic_update_ppr(apic);
apic_clear_irr(vector, apic);
@@ -1895,7 +1921,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
/* evaluate pending_events before reading the vector */
smp_rmb();
sipi_vector = apic->sipi_vector;
- pr_debug("vcpu %d received sipi with vector # %x\n",
+ apic_debug("vcpu %d received sipi with vector # %x\n",
vcpu->vcpu_id, sipi_vector);
kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 1185fe7a7f4..9ade5cfb5a4 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -273,7 +273,7 @@ static int mmu_audit_set(const char *val, const struct kernel_param *kp)
int ret;
unsigned long enable;
- ret = strict_strtoul(val, 10, &enable);
+ ret = kstrtoul(val, 10, &enable);
if (ret < 0)
return -EINVAL;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 9d2e0ffcb19..5aaf3564176 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -22,7 +22,7 @@
__entry->unsync = sp->unsync;
#define KVM_MMU_PAGE_PRINTK() ({ \
- const char *ret = p->buffer + p->len; \
+ const u32 saved_len = p->len; \
static const char *access_str[] = { \
"---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \
}; \
@@ -41,7 +41,7 @@
role.nxe ? "" : "!", \
__entry->root_count, \
__entry->unsync ? "unsync" : "sync", 0); \
- ret; \
+ p->buffer + saved_len; \
})
#define kvm_mmu_trace_pferr_flags \
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index cbecaa90399..3dd6accb64e 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -428,6 +428,15 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
+int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ bool fixed = pmc & (1u << 30);
+ pmc &= ~(3u << 30);
+ return (!fixed && pmc >= pmu->nr_arch_gp_counters) ||
+ (fixed && pmc >= pmu->nr_arch_fixed_counters);
+}
+
int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
{
struct kvm_pmu *pmu = &vcpu->arch.pmu;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ec8366c5cfe..ddf742768ec 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -486,14 +486,14 @@ static int is_external_interrupt(u32 info)
return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
}
-static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 ret = 0;
if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
- ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
- return ret & mask;
+ ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
+ return ret;
}
static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
@@ -1415,7 +1415,16 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
- var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
+
+ /*
+ * AMD CPUs circa 2014 track the G bit for all segments except CS.
+ * However, the SVM spec states that the G bit is not observed by the
+ * CPU, and some VMware virtual CPUs drop the G bit for all segments.
+ * So let's synthesize a legal G bit for all segments, this helps
+ * running KVM nested. It also helps cross-vendor migration, because
+ * Intel's vmentry has a check on the 'G' bit.
+ */
+ var->g = s->limit > 0xfffff;
/*
* AMD's VMCB does not have an explicit unusable field, so emulate it
@@ -1424,14 +1433,6 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->unusable = !var->present || (var->type == 0);
switch (seg) {
- case VCPU_SREG_CS:
- /*
- * SVM always stores 0 for the 'G' bit in the CS selector in
- * the VMCB on a VMEXIT. This hurts cross-vendor migration:
- * Intel's VMENTRY has a check on the 'G' bit.
- */
- var->g = s->limit > 0xfffff;
- break;
case VCPU_SREG_TR:
/*
* Work around a bug where the busy flag in the tr selector
@@ -1462,6 +1463,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
*/
if (var->unusable)
var->db = 0;
+ var->dpl = to_svm(vcpu)->vmcb->save.cpl;
break;
}
}
@@ -2115,22 +2117,27 @@ static void nested_svm_unmap(struct page *page)
static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
{
- unsigned port;
- u8 val, bit;
+ unsigned port, size, iopm_len;
+ u16 val, mask;
+ u8 start_bit;
u64 gpa;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
return NESTED_EXIT_HOST;
port = svm->vmcb->control.exit_info_1 >> 16;
+ size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
+ SVM_IOIO_SIZE_SHIFT;
gpa = svm->nested.vmcb_iopm + (port / 8);
- bit = port % 8;
- val = 0;
+ start_bit = port % 8;
+ iopm_len = (start_bit + size > 8) ? 2 : 1;
+ mask = (0xf >> (4 - size)) << start_bit;
+ val = 0;
- if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
- val &= (1 << bit);
+ if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len))
+ return NESTED_EXIT_DONE;
- return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+ return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
@@ -4204,7 +4211,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
if (info->intercept == x86_intercept_cr_write)
icpt_info.exit_code += info->modrm_reg;
- if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0)
+ if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
+ info->intercept == x86_intercept_clts)
break;
intercept = svm->nested.intercept;
@@ -4249,14 +4257,14 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
u64 exit_info;
u32 bytes;
- exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16;
-
if (info->intercept == x86_intercept_in ||
info->intercept == x86_intercept_ins) {
- exit_info |= SVM_IOIO_TYPE_MASK;
- bytes = info->src_bytes;
- } else {
+ exit_info = ((info->src_val & 0xffff) << 16) |
+ SVM_IOIO_TYPE_MASK;
bytes = info->dst_bytes;
+ } else {
+ exit_info = (info->dst_val & 0xffff) << 16;
+ bytes = info->src_bytes;
}
if (info->intercept == x86_intercept_outs ||
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 33574c95220..e850a7d332b 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -721,10 +721,10 @@ TRACE_EVENT(kvm_emulate_insn,
),
TP_fast_assign(
- __entry->rip = vcpu->arch.emulate_ctxt.fetch.start;
__entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
- __entry->len = vcpu->arch.emulate_ctxt._eip
- - vcpu->arch.emulate_ctxt.fetch.start;
+ __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr
+ - vcpu->arch.emulate_ctxt.fetch.data;
+ __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len;
memcpy(__entry->insn,
vcpu->arch.emulate_ctxt.fetch.data,
15);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 801332edefc..bfe11cf124a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -383,6 +383,9 @@ struct nested_vmx {
struct hrtimer preemption_timer;
bool preemption_timer_expired;
+
+ /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
+ u64 vmcs01_debugctl;
};
#define POSTED_INTR_ON 0
@@ -740,7 +743,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
-static bool vmx_mpx_supported(void);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -820,7 +822,6 @@ static const u32 vmx_msr_index[] = {
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
-#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
static inline bool is_page_fault(u32 intr_info)
{
@@ -1940,7 +1941,7 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
}
-static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
int ret = 0;
@@ -1950,7 +1951,7 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
if (interruptibility & GUEST_INTR_STATE_MOV_SS)
ret |= KVM_X86_SHADOW_INT_MOV_SS;
- return ret & mask;
+ return ret;
}
static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
@@ -2239,10 +2240,13 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
* or other means.
*/
static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
+static u32 nested_vmx_true_procbased_ctls_low;
static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
+static u32 nested_vmx_true_exit_ctls_low;
static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static u32 nested_vmx_true_entry_ctls_low;
static u32 nested_vmx_misc_low, nested_vmx_misc_high;
static u32 nested_vmx_ept_caps;
static __init void nested_vmx_setup_ctls_msrs(void)
@@ -2265,21 +2269,13 @@ static __init void nested_vmx_setup_ctls_msrs(void)
/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
- /*
- * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
- * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
- */
nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
- /*
- * Exit controls
- * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
- * 17 must be 1.
- */
+ /* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
@@ -2296,10 +2292,13 @@ static __init void nested_vmx_setup_ctls_msrs(void)
if (vmx_mpx_supported())
nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ /* We support free control of debug control saving. */
+ nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low &
+ ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
- /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_entry_ctls_high &=
#ifdef CONFIG_X86_64
@@ -2311,10 +2310,14 @@ static __init void nested_vmx_setup_ctls_msrs(void)
if (vmx_mpx_supported())
nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ /* We support free control of debug control loading. */
+ nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low &
+ ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
- nested_vmx_procbased_ctls_low = 0;
+ nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
nested_vmx_procbased_ctls_high &=
CPU_BASED_VIRTUAL_INTR_PENDING |
CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
@@ -2335,7 +2338,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
* can use it to avoid exits to L1 - even when L0 runs L2
* without MSR bitmaps.
*/
- nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
+ nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ CPU_BASED_USE_MSR_BITMAPS;
+
+ /* We support free control of CR3 access interception. */
+ nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low &
+ ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
/* secondary cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
@@ -2394,7 +2402,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
* guest, and the VMCS structure we give it - not about the
* VMX support of the underlying hardware.
*/
- *pdata = VMCS12_REVISION |
+ *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
break;
@@ -2404,16 +2412,25 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
nested_vmx_pinbased_ctls_high);
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low,
+ nested_vmx_procbased_ctls_high);
+ break;
case MSR_IA32_VMX_PROCBASED_CTLS:
*pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
nested_vmx_procbased_ctls_high);
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low,
+ nested_vmx_exit_ctls_high);
+ break;
case MSR_IA32_VMX_EXIT_CTLS:
*pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
nested_vmx_exit_ctls_high);
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low,
+ nested_vmx_entry_ctls_high);
+ break;
case MSR_IA32_VMX_ENTRY_CTLS:
*pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
nested_vmx_entry_ctls_high);
@@ -2442,7 +2459,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
*pdata = -1ULL;
break;
case MSR_IA32_VMX_VMCS_ENUM:
- *pdata = 0x1f;
+ *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
*pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
@@ -3653,7 +3670,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
out:
- vmx->emulation_required |= emulation_required(vcpu);
+ vmx->emulation_required = emulation_required(vcpu);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -4422,7 +4439,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmx->vcpu.arch.pat = host_pat;
}
- for (i = 0; i < NR_VMX_MSR; ++i) {
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
u32 index = vmx_msr_index[i];
u32 data_low, data_high;
int j = vmx->nmsrs;
@@ -4873,7 +4890,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= dr6;
+ vcpu->arch.dr6 |= dr6 | DR6_RTM;
if (!(dr6 & ~DR6_RESERVED)) /* icebp */
skip_emulated_instruction(vcpu);
@@ -5039,7 +5056,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
- val = kvm_register_read(vcpu, reg);
+ val = kvm_register_readl(vcpu, reg);
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
@@ -5056,7 +5073,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return 1;
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
- u8 cr8 = kvm_register_read(vcpu, reg);
+ u8 cr8 = (u8)val;
err = kvm_set_cr8(vcpu, cr8);
kvm_complete_insn_gp(vcpu, err);
if (irqchip_in_kernel(vcpu->kvm))
@@ -5132,7 +5149,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
return 0;
} else {
vcpu->arch.dr7 &= ~DR7_GD;
- vcpu->arch.dr6 |= DR6_BD;
+ vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
kvm_queue_exception(vcpu, DB_VECTOR);
return 1;
@@ -5165,7 +5182,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
return 1;
kvm_register_write(vcpu, reg, val);
} else
- if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)))
+ if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
return 1;
skip_emulated_instruction(vcpu);
@@ -5621,7 +5638,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
- while (!guest_state_valid(vcpu) && count-- != 0) {
+ while (vmx->emulation_required && count-- != 0) {
if (intr_window_requested && vmx_interrupt_allowed(vcpu))
return handle_interrupt_window(&vmx->vcpu);
@@ -5655,7 +5672,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
schedule();
}
- vmx->emulation_required = emulation_required(vcpu);
out:
return ret;
}
@@ -5754,22 +5770,27 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
/*
* Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
- * currently used, if running L2), and vmcs01 when running L2.
+ * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
+ * must be &vmx->vmcs01.
*/
static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
{
struct vmcs02_list *item, *n;
+
+ WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
- if (vmx->loaded_vmcs != &item->vmcs02)
- free_loaded_vmcs(&item->vmcs02);
+ /*
+ * Something will leak if the above WARN triggers. Better than
+ * a use-after-free.
+ */
+ if (vmx->loaded_vmcs == &item->vmcs02)
+ continue;
+
+ free_loaded_vmcs(&item->vmcs02);
list_del(&item->list);
kfree(item);
+ vmx->nested.vmcs02_num--;
}
- vmx->nested.vmcs02_num = 0;
-
- if (vmx->loaded_vmcs != &vmx->vmcs01)
- free_loaded_vmcs(&vmx->vmcs01);
}
/*
@@ -5918,7 +5939,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
* which replaces physical address width with 32
*
*/
- if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
nested_vmx_failInvalid(vcpu);
skip_emulated_instruction(vcpu);
return 1;
@@ -5936,7 +5957,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
vmx->nested.vmxon_ptr = vmptr;
break;
case EXIT_REASON_VMCLEAR:
- if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_INVALID_ADDRESS);
skip_emulated_instruction(vcpu);
@@ -5951,7 +5972,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
}
break;
case EXIT_REASON_VMPTRLD:
- if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) {
+ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INVALID_ADDRESS);
skip_emulated_instruction(vcpu);
@@ -6086,20 +6107,27 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
{
u32 exec_control;
+ if (vmx->nested.current_vmptr == -1ull)
+ return;
+
+ /* current_vmptr and current_vmcs12 are always set/reset together */
+ if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
+ return;
+
if (enable_shadow_vmcs) {
- if (vmx->nested.current_vmcs12 != NULL) {
- /* copy to memory all shadowed fields in case
- they were modified */
- copy_shadow_to_vmcs12(vmx);
- vmx->nested.sync_shadow_vmcs = false;
- exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
- exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
- vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
- }
+ /* copy to memory all shadowed fields in case
+ they were modified */
+ copy_shadow_to_vmcs12(vmx);
+ vmx->nested.sync_shadow_vmcs = false;
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
kunmap(vmx->nested.current_vmcs12_page);
nested_release_page(vmx->nested.current_vmcs12_page);
+ vmx->nested.current_vmptr = -1ull;
+ vmx->nested.current_vmcs12 = NULL;
}
/*
@@ -6110,12 +6138,9 @@ static void free_nested(struct vcpu_vmx *vmx)
{
if (!vmx->nested.vmxon)
return;
+
vmx->nested.vmxon = false;
- if (vmx->nested.current_vmptr != -1ull) {
- nested_release_vmcs12(vmx);
- vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
- }
+ nested_release_vmcs12(vmx);
if (enable_shadow_vmcs)
free_vmcs(vmx->nested.current_shadow_vmcs);
/* Unpin physical memory we referred to in current vmcs02 */
@@ -6152,11 +6177,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
return 1;
- if (vmptr == vmx->nested.current_vmptr) {
+ if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vmx);
- vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
- }
page = nested_get_page(vcpu, vmptr);
if (page == NULL) {
@@ -6384,7 +6406,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
return 1;
/* Decode instruction info and find the field to read */
- field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/* Read the field, zero-extended to a u64 field_value */
if (!vmcs12_read_any(vcpu, field, &field_value)) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
@@ -6397,7 +6419,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
* on the guest's mode (32 or 64 bit), not on the given field's length.
*/
if (vmx_instruction_info & (1u << 10)) {
- kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
+ kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
field_value);
} else {
if (get_vmx_mem_address(vcpu, exit_qualification,
@@ -6434,21 +6456,21 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return 1;
if (vmx_instruction_info & (1u << 10))
- field_value = kvm_register_read(vcpu,
+ field_value = kvm_register_readl(vcpu,
(((vmx_instruction_info) >> 3) & 0xf));
else {
if (get_vmx_mem_address(vcpu, exit_qualification,
vmx_instruction_info, &gva))
return 1;
if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
- &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
+ &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
}
- field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
if (vmcs_field_readonly(field)) {
nested_vmx_failValid(vcpu,
VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
@@ -6498,9 +6520,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
skip_emulated_instruction(vcpu);
return 1;
}
- if (vmx->nested.current_vmptr != -1ull)
- nested_release_vmcs12(vmx);
+ nested_release_vmcs12(vmx);
vmx->nested.current_vmptr = vmptr;
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
@@ -6571,7 +6592,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
}
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+ type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
@@ -6751,7 +6772,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
int cr = exit_qualification & 15;
int reg = (exit_qualification >> 8) & 15;
- unsigned long val = kvm_register_read(vcpu, reg);
+ unsigned long val = kvm_register_readl(vcpu, reg);
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
@@ -7112,7 +7133,26 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
if (max_irr == -1)
return;
- vmx_set_rvi(max_irr);
+ /*
+ * If a vmexit is needed, vmx_check_nested_events handles it.
+ */
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return;
+
+ if (!is_guest_mode(vcpu)) {
+ vmx_set_rvi(max_irr);
+ return;
+ }
+
+ /*
+ * Fall back to pre-APICv interrupt injection since L2
+ * is run without virtual interrupt delivery.
+ */
+ if (!kvm_event_needs_reinjection(vcpu) &&
+ vmx_interrupt_allowed(vcpu)) {
+ kvm_queue_interrupt(vcpu, max_irr, false);
+ vmx_inject_irq(vcpu);
+ }
}
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -7520,13 +7560,31 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_complete_interrupts(vmx);
}
+static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int cpu;
+
+ if (vmx->loaded_vmcs == &vmx->vmcs01)
+ return;
+
+ cpu = get_cpu();
+ vmx->loaded_vmcs = &vmx->vmcs01;
+ vmx_vcpu_put(vcpu);
+ vmx_vcpu_load(vcpu, cpu);
+ vcpu->cpu = cpu;
+ put_cpu();
+}
+
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
free_vpid(vmx);
- free_loaded_vmcs(vmx->loaded_vmcs);
+ leave_guest_mode(vcpu);
+ vmx_load_vmcs01(vcpu);
free_nested(vmx);
+ free_loaded_vmcs(vmx->loaded_vmcs);
kfree(vmx->guest_msrs);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -7548,6 +7606,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vcpu;
vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
+ > PAGE_SIZE);
+
err = -ENOMEM;
if (!vmx->guest_msrs) {
goto uninit_vcpu;
@@ -7836,7 +7897,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
- vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+ kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+ } else {
+ kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
+ }
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
vmcs12->vm_entry_intr_info_field);
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
@@ -7846,7 +7913,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
vmcs12->guest_interruptibility_info);
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
- kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
vmcs12->guest_pending_dbg_exceptions);
@@ -8113,14 +8179,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
}
if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
- !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
+ !PAGE_ALIGNED(vmcs12->msr_bitmap)) {
/*TODO: Also verify bits beyond physical address width are 0*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
- !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
+ !PAGE_ALIGNED(vmcs12->apic_access_addr)) {
/*TODO: Also verify bits beyond physical address width are 0*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
@@ -8136,15 +8202,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
}
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
- nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
+ nested_vmx_true_procbased_ctls_low,
+ nested_vmx_procbased_ctls_high) ||
!vmx_control_verify(vmcs12->secondary_vm_exec_control,
nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
!vmx_control_verify(vmcs12->vm_exit_controls,
- nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
+ nested_vmx_true_exit_ctls_low,
+ nested_vmx_exit_ctls_high) ||
!vmx_control_verify(vmcs12->vm_entry_controls,
- nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
+ nested_vmx_true_entry_ctls_low,
+ nested_vmx_entry_ctls_high))
{
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
@@ -8221,6 +8290,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
+ if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+ vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+
cpu = get_cpu();
vmx->loaded_vmcs = vmcs02;
vmx_vcpu_put(vcpu);
@@ -8398,7 +8470,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
- kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
@@ -8477,9 +8548,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
+ if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
+ kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+ vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+ }
+
/* TODO: These cannot have changed unless we have MSR bitmaps and
* the relevant bit asks not to trap the change */
- vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
@@ -8670,7 +8745,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
unsigned long exit_qualification)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int cpu;
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
/* trying to cancel vmlaunch/vmresume is a bug */
@@ -8680,6 +8754,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
exit_qualification);
+ vmx_load_vmcs01(vcpu);
+
if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
&& nested_exit_intr_ack_set(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
@@ -8695,13 +8771,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs12->vm_exit_intr_error_code,
KVM_ISA_VMX);
- cpu = get_cpu();
- vmx->loaded_vmcs = &vmx->vmcs01;
- vmx_vcpu_put(vcpu);
- vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
- put_cpu();
-
vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
vmx_segment_cache_clear(vmx);
@@ -8890,7 +8959,7 @@ static int __init vmx_init(void)
rdmsrl_safe(MSR_EFER, &host_efer);
- for (i = 0; i < NR_VMX_MSR; ++i)
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
kvm_define_shared_msr(i, vmx_msr_index[i]);
vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f32a02578c0..8f1e22d3b28 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -87,6 +87,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -211,6 +212,7 @@ static void shared_msr_update(unsigned slot, u32 msr)
void kvm_define_shared_msr(unsigned slot, u32 msr)
{
+ BUG_ON(slot >= KVM_NR_SHARED_MSRS);
if (slot >= shared_msrs_global.nr)
shared_msrs_global.nr = slot + 1;
shared_msrs_global.msrs[slot] = msr;
@@ -310,6 +312,31 @@ static int exception_class(int vector)
return EXCPT_BENIGN;
}
+#define EXCPT_FAULT 0
+#define EXCPT_TRAP 1
+#define EXCPT_ABORT 2
+#define EXCPT_INTERRUPT 3
+
+static int exception_type(int vector)
+{
+ unsigned int mask;
+
+ if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
+ return EXCPT_INTERRUPT;
+
+ mask = 1 << vector;
+
+ /* #DB is trap, as instruction watchpoints are handled elsewhere */
+ if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+ return EXCPT_TRAP;
+
+ if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
+ return EXCPT_ABORT;
+
+ /* Reserved exceptions will result in fault */
+ return EXCPT_FAULT;
+}
+
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
bool reinject)
@@ -758,6 +785,15 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
+static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
+{
+ u64 fixed = DR6_FIXED_1;
+
+ if (!guest_cpuid_has_rtm(vcpu))
+ fixed |= DR6_RTM;
+ return fixed;
+}
+
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
switch (dr) {
@@ -773,7 +809,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
case 6:
if (val & 0xffffffff00000000ULL)
return -1; /* #GP */
- vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
kvm_update_dr6(vcpu);
break;
case 5:
@@ -984,9 +1020,8 @@ struct pvclock_gtod_data {
u32 shift;
} clock;
- /* open coded 'struct timespec' */
- u64 monotonic_time_snsec;
- time_t monotonic_time_sec;
+ u64 boot_ns;
+ u64 nsec_base;
};
static struct pvclock_gtod_data pvclock_gtod_data;
@@ -994,27 +1029,21 @@ static struct pvclock_gtod_data pvclock_gtod_data;
static void update_pvclock_gtod(struct timekeeper *tk)
{
struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+ u64 boot_ns;
+
+ boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot));
write_seqcount_begin(&vdata->seq);
/* copy pvclock gtod data */
- vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
- vdata->clock.cycle_last = tk->clock->cycle_last;
- vdata->clock.mask = tk->clock->mask;
- vdata->clock.mult = tk->mult;
- vdata->clock.shift = tk->shift;
-
- vdata->monotonic_time_sec = tk->xtime_sec
- + tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->xtime_nsec
- + (tk->wall_to_monotonic.tv_nsec
- << tk->shift);
- while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->shift)) {
- vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->shift;
- vdata->monotonic_time_sec++;
- }
+ vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode;
+ vdata->clock.cycle_last = tk->tkr.cycle_last;
+ vdata->clock.mask = tk->tkr.mask;
+ vdata->clock.mult = tk->tkr.mult;
+ vdata->clock.shift = tk->tkr.shift;
+
+ vdata->boot_ns = boot_ns;
+ vdata->nsec_base = tk->tkr.xtime_nsec;
write_seqcount_end(&vdata->seq);
}
@@ -1109,11 +1138,7 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
static inline u64 get_kernel_ns(void)
{
- struct timespec ts;
-
- ktime_get_ts(&ts);
- monotonic_to_bootbased(&ts);
- return timespec_to_ns(&ts);
+ return ktime_get_boot_ns();
}
#ifdef CONFIG_X86_64
@@ -1215,6 +1240,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
unsigned long flags;
s64 usdiff;
bool matched;
+ bool already_matched;
u64 data = msr->data;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
@@ -1279,6 +1305,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
matched = true;
+ already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
} else {
/*
* We split periods of matched TSC writes into generations.
@@ -1294,7 +1321,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
kvm->arch.cur_tsc_write = data;
kvm->arch.cur_tsc_offset = offset;
matched = false;
- pr_debug("kvm: new tsc generation %u, clock %llu\n",
+ pr_debug("kvm: new tsc generation %llu, clock %llu\n",
kvm->arch.cur_tsc_generation, data);
}
@@ -1319,10 +1346,11 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
- if (matched)
- kvm->arch.nr_vcpus_matched_tsc++;
- else
+ if (!matched) {
kvm->arch.nr_vcpus_matched_tsc = 0;
+ } else if (!already_matched) {
+ kvm->arch.nr_vcpus_matched_tsc++;
+ }
kvm_track_tsc_matching(vcpu);
spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -1375,23 +1403,22 @@ static inline u64 vgettsc(cycle_t *cycle_now)
return v * gtod->clock.mult;
}
-static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
- u64 ns;
int mode;
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ u64 ns;
- ts->tv_nsec = 0;
do {
seq = read_seqcount_begin(&gtod->seq);
mode = gtod->clock.vclock_mode;
- ts->tv_sec = gtod->monotonic_time_sec;
- ns = gtod->monotonic_time_snsec;
+ ns = gtod->nsec_base;
ns += vgettsc(cycle_now);
ns >>= gtod->clock.shift;
+ ns += gtod->boot_ns;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
- timespec_add_ns(ts, ns);
+ *t = ns;
return mode;
}
@@ -1399,19 +1426,11 @@ static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
/* returns true if host is using tsc clocksource */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
{
- struct timespec ts;
-
/* checked again under seqlock below */
if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
return false;
- if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
- return false;
-
- monotonic_to_bootbased(&ts);
- *kernel_ns = timespec_to_ns(&ts);
-
- return true;
+ return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
}
#endif
@@ -1898,7 +1917,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
break;
gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
- if (kvm_write_guest(kvm, data,
+ if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
&tsc_ref, sizeof(tsc_ref)))
return 1;
mark_page_dirty(kvm, gfn);
@@ -2032,6 +2051,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
data &= ~(u64)0x40; /* ignore flush filter disable */
data &= ~(u64)0x100; /* ignore ignne emulation enable */
data &= ~(u64)0x8; /* ignore TLB cache disable */
+ data &= ~(u64)0x40000; /* ignore Mc status write enable */
if (data != 0) {
vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
@@ -2616,7 +2636,7 @@ out:
return r;
}
-int kvm_dev_ioctl_check_extension(long ext)
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
@@ -2974,9 +2994,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
events->interrupt.soft = 0;
- events->interrupt.shadow =
- kvm_x86_ops->get_interrupt_shadow(vcpu,
- KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
+ events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending != 0;
@@ -4082,7 +4100,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
+ ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data,
+ offset, toread);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
goto out;
@@ -4103,10 +4122,24 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ unsigned offset;
+ int ret;
- return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
- access | PFERR_FETCH_MASK,
- exception);
+ /* Inline kvm_read_guest_virt_helper for speed. */
+ gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
+ exception);
+ if (unlikely(gpa == UNMAPPED_GVA))
+ return X86EMUL_PROPAGATE_FAULT;
+
+ offset = addr & (PAGE_SIZE-1);
+ if (WARN_ON(offset + bytes > PAGE_SIZE))
+ bytes = (unsigned)PAGE_SIZE - offset;
+ ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val,
+ offset, bytes);
+ if (unlikely(ret < 0))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
}
int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
@@ -4730,7 +4763,6 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
if (desc->g)
var.limit = (var.limit << 12) | 0xfff;
var.type = desc->type;
- var.present = desc->p;
var.dpl = desc->dpl;
var.db = desc->d;
var.s = desc->s;
@@ -4762,6 +4794,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
}
+static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
+ u32 pmc)
+{
+ return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc);
+}
+
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc, u64 *pdata)
{
@@ -4838,6 +4876,7 @@ static const struct x86_emulate_ops emulate_ops = {
.set_dr = emulator_set_dr,
.set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
+ .check_pmc = emulator_check_pmc,
.read_pmc = emulator_read_pmc,
.halt = emulator_halt,
.wbinvd = emulator_wbinvd,
@@ -4850,7 +4889,7 @@ static const struct x86_emulate_ops emulate_ops = {
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
+ u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
@@ -4858,8 +4897,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
* means that the last instruction is an sti. We should not
* leave the flag on in this case. The same goes for mov ss
*/
- if (!(int_shadow & mask))
+ if (int_shadow & mask)
+ mask = 0;
+ if (unlikely(int_shadow || mask)) {
kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+ if (!mask)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
}
static void inject_emulated_exception(struct kvm_vcpu *vcpu)
@@ -4874,19 +4918,6 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
kvm_queue_exception(vcpu, ctxt->exception.vector);
}
-static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
-{
- memset(&ctxt->opcode_len, 0,
- (void *)&ctxt->_regs - (void *)&ctxt->opcode_len);
-
- ctxt->fetch.start = 0;
- ctxt->fetch.end = 0;
- ctxt->io_read.pos = 0;
- ctxt->io_read.end = 0;
- ctxt->mem_read.pos = 0;
- ctxt->mem_read.end = 0;
-}
-
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
@@ -5085,23 +5116,22 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
return dr6;
}
-static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
+static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
{
struct kvm_run *kvm_run = vcpu->run;
/*
- * Use the "raw" value to see if TF was passed to the processor.
- * Note that the new value of the flags has not been saved yet.
+ * rflags is the old, "raw" value of the flags. The new value has
+ * not been saved yet.
*
* This is correct even for TF set by the guest, because "the
* processor will not generate this exception after the instruction
* that sets the TF flag".
*/
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
-
if (unlikely(rflags & X86_EFLAGS_TF)) {
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
- kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1;
+ kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
+ DR6_RTM;
kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
@@ -5114,7 +5144,7 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r)
* cleared by the processor".
*/
vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= DR6_BS;
+ vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
kvm_queue_exception(vcpu, DB_VECTOR);
}
}
@@ -5133,7 +5163,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
vcpu->arch.eff_db);
if (dr6 != 0) {
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
get_segment_base(vcpu, VCPU_SREG_CS);
@@ -5144,14 +5174,15 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
}
}
- if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) {
+ if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
+ !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
dr6 = kvm_vcpu_check_hw_bp(eip, 0,
vcpu->arch.dr7,
vcpu->arch.db);
if (dr6 != 0) {
vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= dr6;
+ vcpu->arch.dr6 |= dr6 | DR6_RTM;
kvm_queue_exception(vcpu, DB_VECTOR);
*r = EMULATE_DONE;
return true;
@@ -5215,6 +5246,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (emulation_type & EMULTYPE_SKIP) {
kvm_rip_write(vcpu, ctxt->_eip);
+ if (ctxt->eflags & X86_EFLAGS_RF)
+ kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
return EMULATE_DONE;
}
@@ -5265,13 +5298,22 @@ restart:
r = EMULATE_DONE;
if (writeback) {
+ unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
if (r == EMULATE_DONE)
- kvm_vcpu_check_singlestep(vcpu, &r);
- kvm_set_rflags(vcpu, ctxt->eflags);
+ kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+ __kvm_set_rflags(vcpu, ctxt->eflags);
+
+ /*
+ * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
+ * do nothing, and it will be requested again as soon as
+ * the shadow expires. But we still need to check here,
+ * because POPF has no interrupt shadow.
+ */
+ if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
} else
vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
@@ -5662,7 +5704,6 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
u64 param, ingpa, outgpa, ret;
uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
bool fast, longmode;
- int cs_db, cs_l;
/*
* hypercall generates UD from non zero cpl and real mode
@@ -5673,8 +5714,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
return 0;
}
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- longmode = is_long_mode(vcpu) && cs_l == 1;
+ longmode = is_64_bit_mode(vcpu);
if (!longmode) {
param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
@@ -5739,7 +5779,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
- int r = 1;
+ int op_64_bit, r = 1;
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
@@ -5752,7 +5792,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
trace_kvm_hypercall(nr, a0, a1, a2, a3);
- if (!is_long_mode(vcpu)) {
+ op_64_bit = is_64_bit_mode(vcpu);
+ if (!op_64_bit) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
a1 &= 0xFFFFFFFF;
@@ -5778,6 +5819,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
break;
}
out:
+ if (!op_64_bit)
+ ret = (u32)ret;
kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
++vcpu->stat.hypercalls;
return r;
@@ -5856,6 +5899,11 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
trace_kvm_inj_exception(vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code);
+
+ if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
+ __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
+ X86_EFLAGS_RF);
+
kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
@@ -5887,6 +5935,18 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
kvm_x86_ops->set_nmi(vcpu);
}
} else if (kvm_cpu_has_injectable_intr(vcpu)) {
+ /*
+ * Because interrupts can be injected asynchronously, we are
+ * calling check_nested_events again here to avoid a race condition.
+ * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
+ * proposal and current concerns. Perhaps we should be setting
+ * KVM_REQ_EVENT only on certain events and not unconditionally?
+ */
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
+ r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ if (r != 0)
+ return r;
+ }
if (kvm_x86_ops->interrupt_allowed(vcpu)) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
false);
@@ -6835,9 +6895,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
+ kvm_clear_interrupt_queue(vcpu);
+ kvm_clear_exception_queue(vcpu);
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
- vcpu->arch.dr6 = DR6_FIXED_1;
+ vcpu->arch.dr6 = DR6_INIT;
kvm_update_dr6(vcpu);
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(vcpu);
@@ -7393,12 +7455,17 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_rflags);
-void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
rflags |= X86_EFLAGS_TF;
kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
+void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+ __kvm_set_rflags(vcpu, rflags);
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 8c97bac9a89..306a1b77581 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -47,6 +47,16 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
#endif
}
+static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
+{
+ int cs_db, cs_l;
+
+ if (!is_long_mode(vcpu))
+ return false;
+ kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ return cs_l;
+}
+
static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
{
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
@@ -108,6 +118,23 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
return false;
}
+static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ unsigned long val = kvm_register_read(vcpu, reg);
+
+ return is_64_bit_mode(vcpu) ? val : (u32)val;
+}
+
+static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg,
+ unsigned long val)
+{
+ if (!is_64_bit_mode(vcpu))
+ val = (u32)val;
+ return kvm_register_write(vcpu, reg, val);
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 36642793e31..a2419468151 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -577,6 +577,8 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
+static const char smep_warning[] = KERN_CRIT
+"unable to execute userspace code (SMEP?) (uid: %d)\n";
static void
show_fault_oops(struct pt_regs *regs, unsigned long error_code,
@@ -597,6 +599,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
if (pte && pte_present(*pte) && !pte_exec(*pte))
printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
+ if (pte && pte_present(*pte) && pte_exec(*pte) &&
+ (pgd_flags(*pgd) & _PAGE_USER) &&
+ (read_cr4() & X86_CR4_SMEP))
+ printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
}
printk(KERN_ALERT "BUG: unable to handle kernel ");
@@ -1212,7 +1218,8 @@ good_area:
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
- * the fault:
+ * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
+ * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
*/
fault = handle_mm_fault(mm, vma, address, flags);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index f9713061811..66dba36f234 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,6 +18,13 @@
#include <asm/dma.h> /* for MAX_DMA_PFN */
#include <asm/microcode.h>
+/*
+ * We need to define the tracepoints somewhere, and tlb.c
+ * is only compied when SMP=y.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/tlb.h>
+
#include "mm_internal.h"
static unsigned long __initdata pgt_buf_start;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index e39504878ae..7d05565ba78 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -825,7 +825,8 @@ void __init mem_init(void)
int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdata = NODE_DATA(nid);
- struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
+ struct zone *zone = pgdata->node_zones +
+ zone_for_memory(nid, start, size, ZONE_HIGHMEM);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index df1a9927ad2..5621c47d7a1 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -691,7 +691,8 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(nid);
- struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
+ struct zone *zone = pgdat->node_zones +
+ zone_for_memory(nid, start, size, ZONE_NORMAL);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index dd8dda167a2..1fe33987de0 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -49,6 +49,7 @@ void leave_mm(int cpu)
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
load_cr3(swapper_pg_dir);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
}
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -102,20 +103,24 @@ static void flush_tlb_func(void *info)
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
+ if (!f->flush_end)
+ f->flush_end = f->flush_start + PAGE_SIZE;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
- if (f->flush_end == TLB_FLUSH_ALL)
+ if (f->flush_end == TLB_FLUSH_ALL) {
local_flush_tlb();
- else if (!f->flush_end)
- __flush_tlb_single(f->flush_start);
- else {
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
+ } else {
unsigned long addr;
+ unsigned long nr_pages =
+ f->flush_end - f->flush_start / PAGE_SIZE;
addr = f->flush_start;
while (addr < f->flush_end) {
__flush_tlb_single(addr);
addr += PAGE_SIZE;
}
+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
}
} else
leave_mm(smp_processor_id());
@@ -153,46 +158,45 @@ void flush_tlb_current_task(void)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
+ trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
+/*
+ * See Documentation/x86/tlb.txt for details. We choose 33
+ * because it is large enough to cover the vast majority (at
+ * least 95%) of allocations, and is small enough that we are
+ * confident it will not cause too much overhead. Each single
+ * flush is about 100 ns, so this caps the maximum overhead at
+ * _about_ 3,000 ns.
+ *
+ * This is in units of pages.
+ */
+unsigned long tlb_single_page_flush_ceiling = 33;
+
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
unsigned long addr;
- unsigned act_entries, tlb_entries = 0;
- unsigned long nr_base_pages;
+ /* do a global flush by default */
+ unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
preempt_disable();
if (current->active_mm != mm)
- goto flush_all;
+ goto out;
if (!current->mm) {
leave_mm(smp_processor_id());
- goto flush_all;
+ goto out;
}
- if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
- || vmflag & VM_HUGETLB) {
- local_flush_tlb();
- goto flush_all;
- }
-
- /* In modern CPU, last level tlb used for both data/ins */
- if (vmflag & VM_EXEC)
- tlb_entries = tlb_lli_4k[ENTRIES];
- else
- tlb_entries = tlb_lld_4k[ENTRIES];
+ if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
+ base_pages_to_flush = (end - start) >> PAGE_SHIFT;
- /* Assume all of TLB entries was occupied by this task */
- act_entries = tlb_entries >> tlb_flushall_shift;
- act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
- nr_base_pages = (end - start) >> PAGE_SHIFT;
-
- /* tlb_flushall_shift is on balance point, details in commit log */
- if (nr_base_pages > act_entries) {
+ if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
+ base_pages_to_flush = TLB_FLUSH_ALL;
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
} else {
@@ -201,17 +205,15 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr);
}
-
- if (cpumask_any_but(mm_cpumask(mm),
- smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, end);
- preempt_enable();
- return;
}
-
-flush_all:
+ trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
+out:
+ if (base_pages_to_flush == TLB_FLUSH_ALL) {
+ start = 0UL;
+ end = TLB_FLUSH_ALL;
+ }
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
+ flush_tlb_others(mm_cpumask(mm), mm, start, end);
preempt_enable();
}
@@ -260,32 +262,26 @@ static void do_kernel_range_flush(void *info)
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- unsigned act_entries;
- struct flush_tlb_info info;
-
- /* In modern CPU, last level tlb used for both data/ins */
- act_entries = tlb_lld_4k[ENTRIES];
/* Balance as user space task's flush, a bit conservative */
- if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
- (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
-
+ if (end == TLB_FLUSH_ALL ||
+ (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
on_each_cpu(do_flush_tlb_all, NULL, 1);
- else {
+ } else {
+ struct flush_tlb_info info;
info.flush_start = start;
info.flush_end = end;
on_each_cpu(do_kernel_range_flush, &info, 1);
}
}
-#ifdef CONFIG_DEBUG_TLBFLUSH
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[32];
unsigned int len;
- len = sprintf(buf, "%hd\n", tlb_flushall_shift);
+ len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
@@ -294,20 +290,20 @@ static ssize_t tlbflush_write_file(struct file *file,
{
char buf[32];
ssize_t len;
- s8 shift;
+ int ceiling;
len = min(count, sizeof(buf) - 1);
if (copy_from_user(buf, user_buf, len))
return -EFAULT;
buf[len] = '\0';
- if (kstrtos8(buf, 0, &shift))
+ if (kstrtoint(buf, 0, &ceiling))
return -EINVAL;
- if (shift < -1 || shift >= BITS_PER_LONG)
+ if (ceiling < 0)
return -EINVAL;
- tlb_flushall_shift = shift;
+ tlb_single_page_flush_ceiling = ceiling;
return count;
}
@@ -317,11 +313,10 @@ static const struct file_operations fops_tlbflush = {
.llseek = default_llseek,
};
-static int __init create_tlb_flushall_shift(void)
+static int __init create_tlb_single_page_flush_ceiling(void)
{
- debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
+ debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
arch_debugfs_dir, NULL, &fops_tlbflush);
return 0;
}
-late_initcall(create_tlb_flushall_shift);
-#endif
+late_initcall(create_tlb_single_page_flush_ceiling);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 99bef86ed6d..5c8cb8043c5 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -211,10 +211,10 @@ struct jit_context {
bool seen_ld_abs;
};
-static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image,
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int oldproglen, struct jit_context *ctx)
{
- struct sock_filter_int *insn = bpf_prog->insnsi;
+ struct bpf_insn *insn = bpf_prog->insnsi;
int insn_cnt = bpf_prog->len;
u8 temp[64];
int i;
@@ -235,7 +235,7 @@ static int do_jit(struct sk_filter *bpf_prog, int *addrs, u8 *image,
/* mov qword ptr [rbp-X],rbx */
EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
- /* sk_convert_filter() maps classic BPF register X to R7 and uses R8
+ /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
* as temporary, so all tcpdump filters need to spill/fill R7(r13) and
* R8(r14). R9(r15) spill could be made conditional, but there is only
* one 'bpf_error' return path out of helper functions inside bpf_jit.S
@@ -841,7 +841,7 @@ common_load: ctx->seen_ld_abs = true;
/* By design x64 JIT should support all BPF instructions
* This error will be seen if new instruction was added
* to interpreter, but not to JIT
- * or if there is junk in sk_filter
+ * or if there is junk in bpf_prog
*/
pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
return -EINVAL;
@@ -862,11 +862,11 @@ common_load: ctx->seen_ld_abs = true;
return proglen;
}
-void bpf_jit_compile(struct sk_filter *prog)
+void bpf_jit_compile(struct bpf_prog *prog)
{
}
-void bpf_int_jit_compile(struct sk_filter *prog)
+void bpf_int_jit_compile(struct bpf_prog *prog)
{
struct bpf_binary_header *header = NULL;
int proglen, oldproglen = 0;
@@ -932,7 +932,7 @@ out:
static void bpf_jit_free_deferred(struct work_struct *work)
{
- struct sk_filter *fp = container_of(work, struct sk_filter, work);
+ struct bpf_prog *fp = container_of(work, struct bpf_prog, work);
unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
struct bpf_binary_header *header = (void *)addr;
@@ -941,7 +941,7 @@ static void bpf_jit_free_deferred(struct work_struct *work)
kfree(fp);
}
-void bpf_jit_free(struct sk_filter *fp)
+void bpf_jit_free(struct bpf_prog *fp)
{
if (fp->jited) {
INIT_WORK(&fp->work, bpf_jit_free_deferred);
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index b5e60268d93..c61ea57d1ba 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -326,6 +326,27 @@ static void pci_fixup_video(struct pci_dev *pdev)
struct pci_bus *bus;
u16 config;
+ if (!vga_default_device()) {
+ resource_size_t start, end;
+ int i;
+
+ /* Does firmware framebuffer belong to us? */
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+ if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+ continue;
+
+ start = pci_resource_start(pdev, i);
+ end = pci_resource_end(pdev, i);
+
+ if (!start || !end)
+ continue;
+
+ if (screen_info.lfb_base >= start &&
+ (screen_info.lfb_base + screen_info.lfb_size) < end)
+ vga_set_default_device(pdev);
+ }
+ }
+
/* Is VGA routed to us? */
bus = pdev->bus;
while (bus) {
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index a19ed92e74e..2ae525e0d8b 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -162,6 +162,10 @@ pcibios_align_resource(void *data, const struct resource *res,
return start;
if (start & 0x300)
start = (start + 0x3ff) & ~0x3ff;
+ } else if (res->flags & IORESOURCE_MEM) {
+ /* The low 1MB range is reserved for ISA cards */
+ if (start < BIOS_END)
+ start = BIOS_END;
}
return start;
}
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index d51045afcaa..2846aaab510 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
+obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o
obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 87fc96bcc13..850da94fef3 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -56,13 +56,6 @@
#define EFI_DEBUG
-#define EFI_MIN_RESERVE 5120
-
-#define EFI_DUMMY_GUID \
- EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9)
-
-static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 };
-
struct efi_memory_map memmap;
static struct efi efi_phys __initdata;
@@ -95,139 +88,6 @@ static int __init setup_add_efi_memmap(char *arg)
}
early_param("add_efi_memmap", setup_add_efi_memmap);
-static bool efi_no_storage_paranoia;
-
-static int __init setup_storage_paranoia(char *arg)
-{
- efi_no_storage_paranoia = true;
- return 0;
-}
-early_param("efi_no_storage_paranoia", setup_storage_paranoia);
-
-static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- status = efi_call_virt(get_time, tm, tc);
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-static efi_status_t virt_efi_set_time(efi_time_t *tm)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- status = efi_call_virt(set_time, tm);
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
- efi_bool_t *pending,
- efi_time_t *tm)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- status = efi_call_virt(get_wakeup_time, enabled, pending, tm);
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
-{
- unsigned long flags;
- efi_status_t status;
-
- spin_lock_irqsave(&rtc_lock, flags);
- status = efi_call_virt(set_wakeup_time, enabled, tm);
- spin_unlock_irqrestore(&rtc_lock, flags);
- return status;
-}
-
-static efi_status_t virt_efi_get_variable(efi_char16_t *name,
- efi_guid_t *vendor,
- u32 *attr,
- unsigned long *data_size,
- void *data)
-{
- return efi_call_virt(get_variable,
- name, vendor, attr,
- data_size, data);
-}
-
-static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
- efi_char16_t *name,
- efi_guid_t *vendor)
-{
- return efi_call_virt(get_next_variable,
- name_size, name, vendor);
-}
-
-static efi_status_t virt_efi_set_variable(efi_char16_t *name,
- efi_guid_t *vendor,
- u32 attr,
- unsigned long data_size,
- void *data)
-{
- return efi_call_virt(set_variable,
- name, vendor, attr,
- data_size, data);
-}
-
-static efi_status_t virt_efi_query_variable_info(u32 attr,
- u64 *storage_space,
- u64 *remaining_space,
- u64 *max_variable_size)
-{
- if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
- return EFI_UNSUPPORTED;
-
- return efi_call_virt(query_variable_info, attr, storage_space,
- remaining_space, max_variable_size);
-}
-
-static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
-{
- return efi_call_virt(get_next_high_mono_count, count);
-}
-
-static void virt_efi_reset_system(int reset_type,
- efi_status_t status,
- unsigned long data_size,
- efi_char16_t *data)
-{
- __efi_call_virt(reset_system, reset_type, status,
- data_size, data);
-}
-
-static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules,
- unsigned long count,
- unsigned long sg_list)
-{
- if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
- return EFI_UNSUPPORTED;
-
- return efi_call_virt(update_capsule, capsules, count, sg_list);
-}
-
-static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules,
- unsigned long count,
- u64 *max_size,
- int *reset_type)
-{
- if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
- return EFI_UNSUPPORTED;
-
- return efi_call_virt(query_capsule_caps, capsules, count, max_size,
- reset_type);
-}
-
static efi_status_t __init phys_efi_set_virtual_address_map(
unsigned long memory_map_size,
unsigned long descriptor_size,
@@ -244,42 +104,6 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
return status;
}
-int efi_set_rtc_mmss(const struct timespec *now)
-{
- unsigned long nowtime = now->tv_sec;
- efi_status_t status;
- efi_time_t eft;
- efi_time_cap_t cap;
- struct rtc_time tm;
-
- status = efi.get_time(&eft, &cap);
- if (status != EFI_SUCCESS) {
- pr_err("Oops: efitime: can't read time!\n");
- return -1;
- }
-
- rtc_time_to_tm(nowtime, &tm);
- if (!rtc_valid_tm(&tm)) {
- eft.year = tm.tm_year + 1900;
- eft.month = tm.tm_mon + 1;
- eft.day = tm.tm_mday;
- eft.minute = tm.tm_min;
- eft.second = tm.tm_sec;
- eft.nanosecond = 0;
- } else {
- pr_err("%s: Invalid EFI RTC value: write of %lx to EFI RTC failed\n",
- __func__, nowtime);
- return -1;
- }
-
- status = efi.set_time(&eft);
- if (status != EFI_SUCCESS) {
- pr_err("Oops: efitime: can't write time!\n");
- return -1;
- }
- return 0;
-}
-
void efi_get_time(struct timespec *now)
{
efi_status_t status;
@@ -350,6 +174,9 @@ int __init efi_memblock_x86_reserve_range(void)
struct efi_info *e = &boot_params.efi_info;
unsigned long pmap;
+ if (efi_enabled(EFI_PARAVIRT))
+ return 0;
+
#ifdef CONFIG_X86_32
/* Can't handle data above 4GB at this time */
if (e->efi_memmap_hi) {
@@ -392,69 +219,15 @@ static void __init print_efi_memmap(void)
#endif /* EFI_DEBUG */
}
-void __init efi_reserve_boot_services(void)
-{
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
- u64 start = md->phys_addr;
- u64 size = md->num_pages << EFI_PAGE_SHIFT;
-
- if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
- continue;
- /* Only reserve where possible:
- * - Not within any already allocated areas
- * - Not over any memory area (really needed, if above?)
- * - Not within any part of the kernel
- * - Not the bios reserved area
- */
- if ((start + size > __pa_symbol(_text)
- && start <= __pa_symbol(_end)) ||
- !e820_all_mapped(start, start+size, E820_RAM) ||
- memblock_is_region_reserved(start, size)) {
- /* Could not reserve, skip it */
- md->num_pages = 0;
- memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n",
- start, start+size-1);
- } else
- memblock_reserve(start, size);
- }
-}
-
void __init efi_unmap_memmap(void)
{
clear_bit(EFI_MEMMAP, &efi.flags);
if (memmap.map) {
- early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
+ early_memunmap(memmap.map, memmap.nr_map * memmap.desc_size);
memmap.map = NULL;
}
}
-void __init efi_free_boot_services(void)
-{
- void *p;
-
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
- efi_memory_desc_t *md = p;
- unsigned long long start = md->phys_addr;
- unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
-
- if (md->type != EFI_BOOT_SERVICES_CODE &&
- md->type != EFI_BOOT_SERVICES_DATA)
- continue;
-
- /* Could not reserve boot area */
- if (!size)
- continue;
-
- free_bootmem_late(start, size);
- }
-
- efi_unmap_memmap();
-}
-
static int __init efi_systab_init(void *phys)
{
if (efi_enabled(EFI_64BIT)) {
@@ -467,12 +240,12 @@ static int __init efi_systab_init(void *phys)
if (!data)
return -ENOMEM;
}
- systab64 = early_ioremap((unsigned long)phys,
+ systab64 = early_memremap((unsigned long)phys,
sizeof(*systab64));
if (systab64 == NULL) {
pr_err("Couldn't map the system table!\n");
if (data)
- early_iounmap(data, sizeof(*data));
+ early_memunmap(data, sizeof(*data));
return -ENOMEM;
}
@@ -504,9 +277,9 @@ static int __init efi_systab_init(void *phys)
systab64->tables;
tmp |= data ? data->tables : systab64->tables;
- early_iounmap(systab64, sizeof(*systab64));
+ early_memunmap(systab64, sizeof(*systab64));
if (data)
- early_iounmap(data, sizeof(*data));
+ early_memunmap(data, sizeof(*data));
#ifdef CONFIG_X86_32
if (tmp >> 32) {
pr_err("EFI data located above 4GB, disabling EFI.\n");
@@ -516,7 +289,7 @@ static int __init efi_systab_init(void *phys)
} else {
efi_system_table_32_t *systab32;
- systab32 = early_ioremap((unsigned long)phys,
+ systab32 = early_memremap((unsigned long)phys,
sizeof(*systab32));
if (systab32 == NULL) {
pr_err("Couldn't map the system table!\n");
@@ -537,7 +310,7 @@ static int __init efi_systab_init(void *phys)
efi_systab.nr_tables = systab32->nr_tables;
efi_systab.tables = systab32->tables;
- early_iounmap(systab32, sizeof(*systab32));
+ early_memunmap(systab32, sizeof(*systab32));
}
efi.systab = &efi_systab;
@@ -563,7 +336,7 @@ static int __init efi_runtime_init32(void)
{
efi_runtime_services_32_t *runtime;
- runtime = early_ioremap((unsigned long)efi.systab->runtime,
+ runtime = early_memremap((unsigned long)efi.systab->runtime,
sizeof(efi_runtime_services_32_t));
if (!runtime) {
pr_err("Could not map the runtime service table!\n");
@@ -578,7 +351,7 @@ static int __init efi_runtime_init32(void)
efi_phys.set_virtual_address_map =
(efi_set_virtual_address_map_t *)
(unsigned long)runtime->set_virtual_address_map;
- early_iounmap(runtime, sizeof(efi_runtime_services_32_t));
+ early_memunmap(runtime, sizeof(efi_runtime_services_32_t));
return 0;
}
@@ -587,7 +360,7 @@ static int __init efi_runtime_init64(void)
{
efi_runtime_services_64_t *runtime;
- runtime = early_ioremap((unsigned long)efi.systab->runtime,
+ runtime = early_memremap((unsigned long)efi.systab->runtime,
sizeof(efi_runtime_services_64_t));
if (!runtime) {
pr_err("Could not map the runtime service table!\n");
@@ -602,7 +375,7 @@ static int __init efi_runtime_init64(void)
efi_phys.set_virtual_address_map =
(efi_set_virtual_address_map_t *)
(unsigned long)runtime->set_virtual_address_map;
- early_iounmap(runtime, sizeof(efi_runtime_services_64_t));
+ early_memunmap(runtime, sizeof(efi_runtime_services_64_t));
return 0;
}
@@ -616,14 +389,24 @@ static int __init efi_runtime_init(void)
* the runtime services table so that we can grab the physical
* address of several of the EFI runtime functions, needed to
* set the firmware into virtual mode.
+ *
+ * When EFI_PARAVIRT is in force then we could not map runtime
+ * service memory region because we do not have direct access to it.
+ * However, runtime services are available through proxy functions
+ * (e.g. in case of Xen dom0 EFI implementation they call special
+ * hypercall which executes relevant EFI functions) and that is why
+ * they are always enabled.
*/
- if (efi_enabled(EFI_64BIT))
- rv = efi_runtime_init64();
- else
- rv = efi_runtime_init32();
- if (rv)
- return rv;
+ if (!efi_enabled(EFI_PARAVIRT)) {
+ if (efi_enabled(EFI_64BIT))
+ rv = efi_runtime_init64();
+ else
+ rv = efi_runtime_init32();
+
+ if (rv)
+ return rv;
+ }
set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
@@ -632,8 +415,11 @@ static int __init efi_runtime_init(void)
static int __init efi_memmap_init(void)
{
+ if (efi_enabled(EFI_PARAVIRT))
+ return 0;
+
/* Map the EFI memory map */
- memmap.map = early_ioremap((unsigned long)memmap.phys_map,
+ memmap.map = early_memremap((unsigned long)memmap.phys_map,
memmap.nr_map * memmap.desc_size);
if (memmap.map == NULL) {
pr_err("Could not map the memory map!\n");
@@ -649,62 +435,6 @@ static int __init efi_memmap_init(void)
return 0;
}
-/*
- * A number of config table entries get remapped to virtual addresses
- * after entering EFI virtual mode. However, the kexec kernel requires
- * their physical addresses therefore we pass them via setup_data and
- * correct those entries to their respective physical addresses here.
- *
- * Currently only handles smbios which is necessary for some firmware
- * implementation.
- */
-static int __init efi_reuse_config(u64 tables, int nr_tables)
-{
- int i, sz, ret = 0;
- void *p, *tablep;
- struct efi_setup_data *data;
-
- if (!efi_setup)
- return 0;
-
- if (!efi_enabled(EFI_64BIT))
- return 0;
-
- data = early_memremap(efi_setup, sizeof(*data));
- if (!data) {
- ret = -ENOMEM;
- goto out;
- }
-
- if (!data->smbios)
- goto out_memremap;
-
- sz = sizeof(efi_config_table_64_t);
-
- p = tablep = early_memremap(tables, nr_tables * sz);
- if (!p) {
- pr_err("Could not map Configuration table!\n");
- ret = -ENOMEM;
- goto out_memremap;
- }
-
- for (i = 0; i < efi.systab->nr_tables; i++) {
- efi_guid_t guid;
-
- guid = ((efi_config_table_64_t *)p)->guid;
-
- if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID))
- ((efi_config_table_64_t *)p)->table = data->smbios;
- p += sz;
- }
- early_iounmap(tablep, nr_tables * sz);
-
-out_memremap:
- early_iounmap(data, sizeof(*data));
-out:
- return ret;
-}
-
void __init efi_init(void)
{
efi_char16_t *c16;
@@ -728,8 +458,6 @@ void __init efi_init(void)
if (efi_systab_init(efi_phys.systab))
return;
- set_bit(EFI_SYSTEM_TABLES, &efi.flags);
-
efi.config_table = (unsigned long)efi.systab->tables;
efi.fw_vendor = (unsigned long)efi.systab->fw_vendor;
efi.runtime = (unsigned long)efi.systab->runtime;
@@ -737,14 +465,14 @@ void __init efi_init(void)
/*
* Show what we know for posterity
*/
- c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
+ c16 = tmp = early_memremap(efi.systab->fw_vendor, 2);
if (c16) {
for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
vendor[i] = *c16++;
vendor[i] = '\0';
} else
pr_err("Could not map the firmware vendor!\n");
- early_iounmap(tmp, 2);
+ early_memunmap(tmp, 2);
pr_info("EFI v%u.%.02u by %s\n",
efi.systab->hdr.revision >> 16,
@@ -770,8 +498,6 @@ void __init efi_init(void)
if (efi_memmap_init())
return;
- set_bit(EFI_MEMMAP, &efi.flags);
-
print_efi_memmap();
}
@@ -847,22 +573,6 @@ void __init old_map_region(efi_memory_desc_t *md)
(unsigned long long)md->phys_addr);
}
-static void native_runtime_setup(void)
-{
- efi.get_time = virt_efi_get_time;
- efi.set_time = virt_efi_set_time;
- efi.get_wakeup_time = virt_efi_get_wakeup_time;
- efi.set_wakeup_time = virt_efi_set_wakeup_time;
- efi.get_variable = virt_efi_get_variable;
- efi.get_next_variable = virt_efi_get_next_variable;
- efi.set_variable = virt_efi_set_variable;
- efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
- efi.reset_system = virt_efi_reset_system;
- efi.query_variable_info = virt_efi_query_variable_info;
- efi.update_capsule = virt_efi_update_capsule;
- efi.query_capsule_caps = virt_efi_query_capsule_caps;
-}
-
/* Merge contiguous regions of the same type and attribute */
static void __init efi_merge_regions(void)
{
@@ -1049,7 +759,7 @@ static void __init kexec_enter_virtual_mode(void)
*/
efi.runtime_version = efi_systab.hdr.revision;
- native_runtime_setup();
+ efi_native_runtime_setup();
efi.set_virtual_address_map = NULL;
@@ -1057,11 +767,7 @@ static void __init kexec_enter_virtual_mode(void)
runtime_code_page_mkexec();
/* clean DUMMY object */
- efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- 0, NULL);
+ efi_delete_dummy_variable();
#endif
}
@@ -1142,7 +848,7 @@ static void __init __efi_enter_virtual_mode(void)
efi.runtime_version = efi_systab.hdr.revision;
if (efi_is_native())
- native_runtime_setup();
+ efi_native_runtime_setup();
else
efi_thunk_runtime_setup();
@@ -1179,15 +885,14 @@ static void __init __efi_enter_virtual_mode(void)
free_pages((unsigned long)new_memmap, pg_shift);
/* clean DUMMY object */
- efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- 0, NULL);
+ efi_delete_dummy_variable();
}
void __init efi_enter_virtual_mode(void)
{
+ if (efi_enabled(EFI_PARAVIRT))
+ return;
+
if (efi_setup)
kexec_enter_virtual_mode();
else
@@ -1220,6 +925,9 @@ u64 efi_mem_attributes(unsigned long phys_addr)
efi_memory_desc_t *md;
void *p;
+ if (!efi_enabled(EFI_MEMMAP))
+ return 0;
+
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
md = p;
if ((md->phys_addr <= phys_addr) &&
@@ -1230,86 +938,6 @@ u64 efi_mem_attributes(unsigned long phys_addr)
return 0;
}
-/*
- * Some firmware implementations refuse to boot if there's insufficient space
- * in the variable store. Ensure that we never use more than a safe limit.
- *
- * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable
- * store.
- */
-efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
-{
- efi_status_t status;
- u64 storage_size, remaining_size, max_size;
-
- if (!(attributes & EFI_VARIABLE_NON_VOLATILE))
- return 0;
-
- status = efi.query_variable_info(attributes, &storage_size,
- &remaining_size, &max_size);
- if (status != EFI_SUCCESS)
- return status;
-
- /*
- * We account for that by refusing the write if permitting it would
- * reduce the available space to under 5KB. This figure was provided by
- * Samsung, so should be safe.
- */
- if ((remaining_size - size < EFI_MIN_RESERVE) &&
- !efi_no_storage_paranoia) {
-
- /*
- * Triggering garbage collection may require that the firmware
- * generate a real EFI_OUT_OF_RESOURCES error. We can force
- * that by attempting to use more space than is available.
- */
- unsigned long dummy_size = remaining_size + 1024;
- void *dummy = kzalloc(dummy_size, GFP_ATOMIC);
-
- if (!dummy)
- return EFI_OUT_OF_RESOURCES;
-
- status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- dummy_size, dummy);
-
- if (status == EFI_SUCCESS) {
- /*
- * This should have failed, so if it didn't make sure
- * that we delete it...
- */
- efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
- EFI_VARIABLE_NON_VOLATILE |
- EFI_VARIABLE_BOOTSERVICE_ACCESS |
- EFI_VARIABLE_RUNTIME_ACCESS,
- 0, dummy);
- }
-
- kfree(dummy);
-
- /*
- * The runtime code may now have triggered a garbage collection
- * run, so check the variable info again
- */
- status = efi.query_variable_info(attributes, &storage_size,
- &remaining_size, &max_size);
-
- if (status != EFI_SUCCESS)
- return status;
-
- /*
- * There still isn't enough room, so return an error
- */
- if (remaining_size - size < EFI_MIN_RESERVE)
- return EFI_OUT_OF_RESOURCES;
- }
-
- return EFI_SUCCESS;
-}
-EXPORT_SYMBOL_GPL(efi_query_variable_store);
-
static int __init parse_efi_cmdline(char *str)
{
if (*str == '=')
@@ -1321,22 +949,3 @@ static int __init parse_efi_cmdline(char *str)
return 0;
}
early_param("efi", parse_efi_cmdline);
-
-void __init efi_apply_memmap_quirks(void)
-{
- /*
- * Once setup is done earlier, unmap the EFI memory map on mismatched
- * firmware/kernel architectures since there is no support for runtime
- * services.
- */
- if (!efi_runtime_supported()) {
- pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
- efi_unmap_memmap();
- }
-
- /*
- * UV doesn't support the new EFI pagetable mapping yet.
- */
- if (is_uv_system())
- set_bit(EFI_OLD_MEMMAP, &efi.flags);
-}
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
new file mode 100644
index 00000000000..1c7380da65f
--- /dev/null
+++ b/arch/x86/platform/efi/quirks.c
@@ -0,0 +1,290 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/time.h>
+#include <linux/types.h>
+#include <linux/efi.h>
+#include <linux/slab.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+#include <linux/acpi.h>
+#include <asm/efi.h>
+#include <asm/uv/uv.h>
+
+#define EFI_MIN_RESERVE 5120
+
+#define EFI_DUMMY_GUID \
+ EFI_GUID(0x4424ac57, 0xbe4b, 0x47dd, 0x9e, 0x97, 0xed, 0x50, 0xf0, 0x9f, 0x92, 0xa9)
+
+static efi_char16_t efi_dummy_name[6] = { 'D', 'U', 'M', 'M', 'Y', 0 };
+
+static bool efi_no_storage_paranoia;
+
+/*
+ * Some firmware implementations refuse to boot if there's insufficient
+ * space in the variable store. The implementation of garbage collection
+ * in some FW versions causes stale (deleted) variables to take up space
+ * longer than intended and space is only freed once the store becomes
+ * almost completely full.
+ *
+ * Enabling this option disables the space checks in
+ * efi_query_variable_store() and forces garbage collection.
+ *
+ * Only enable this option if deleting EFI variables does not free up
+ * space in your variable store, e.g. if despite deleting variables
+ * you're unable to create new ones.
+ */
+static int __init setup_storage_paranoia(char *arg)
+{
+ efi_no_storage_paranoia = true;
+ return 0;
+}
+early_param("efi_no_storage_paranoia", setup_storage_paranoia);
+
+/*
+ * Deleting the dummy variable which kicks off garbage collection
+*/
+void efi_delete_dummy_variable(void)
+{
+ efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
+ EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS,
+ 0, NULL);
+}
+
+/*
+ * Some firmware implementations refuse to boot if there's insufficient space
+ * in the variable store. Ensure that we never use more than a safe limit.
+ *
+ * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable
+ * store.
+ */
+efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
+{
+ efi_status_t status;
+ u64 storage_size, remaining_size, max_size;
+
+ if (!(attributes & EFI_VARIABLE_NON_VOLATILE))
+ return 0;
+
+ status = efi.query_variable_info(attributes, &storage_size,
+ &remaining_size, &max_size);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ /*
+ * We account for that by refusing the write if permitting it would
+ * reduce the available space to under 5KB. This figure was provided by
+ * Samsung, so should be safe.
+ */
+ if ((remaining_size - size < EFI_MIN_RESERVE) &&
+ !efi_no_storage_paranoia) {
+
+ /*
+ * Triggering garbage collection may require that the firmware
+ * generate a real EFI_OUT_OF_RESOURCES error. We can force
+ * that by attempting to use more space than is available.
+ */
+ unsigned long dummy_size = remaining_size + 1024;
+ void *dummy = kzalloc(dummy_size, GFP_ATOMIC);
+
+ if (!dummy)
+ return EFI_OUT_OF_RESOURCES;
+
+ status = efi.set_variable(efi_dummy_name, &EFI_DUMMY_GUID,
+ EFI_VARIABLE_NON_VOLATILE |
+ EFI_VARIABLE_BOOTSERVICE_ACCESS |
+ EFI_VARIABLE_RUNTIME_ACCESS,
+ dummy_size, dummy);
+
+ if (status == EFI_SUCCESS) {
+ /*
+ * This should have failed, so if it didn't make sure
+ * that we delete it...
+ */
+ efi_delete_dummy_variable();
+ }
+
+ kfree(dummy);
+
+ /*
+ * The runtime code may now have triggered a garbage collection
+ * run, so check the variable info again
+ */
+ status = efi.query_variable_info(attributes, &storage_size,
+ &remaining_size, &max_size);
+
+ if (status != EFI_SUCCESS)
+ return status;
+
+ /*
+ * There still isn't enough room, so return an error
+ */
+ if (remaining_size - size < EFI_MIN_RESERVE)
+ return EFI_OUT_OF_RESOURCES;
+ }
+
+ return EFI_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(efi_query_variable_store);
+
+/*
+ * The UEFI specification makes it clear that the operating system is free to do
+ * whatever it wants with boot services code after ExitBootServices() has been
+ * called. Ignoring this recommendation a significant bunch of EFI implementations
+ * continue calling into boot services code (SetVirtualAddressMap). In order to
+ * work around such buggy implementations we reserve boot services region during
+ * EFI init and make sure it stays executable. Then, after SetVirtualAddressMap(), it
+* is discarded.
+*/
+void __init efi_reserve_boot_services(void)
+{
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ u64 start = md->phys_addr;
+ u64 size = md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
+ continue;
+ /* Only reserve where possible:
+ * - Not within any already allocated areas
+ * - Not over any memory area (really needed, if above?)
+ * - Not within any part of the kernel
+ * - Not the bios reserved area
+ */
+ if ((start + size > __pa_symbol(_text)
+ && start <= __pa_symbol(_end)) ||
+ !e820_all_mapped(start, start+size, E820_RAM) ||
+ memblock_is_region_reserved(start, size)) {
+ /* Could not reserve, skip it */
+ md->num_pages = 0;
+ memblock_dbg("Could not reserve boot range [0x%010llx-0x%010llx]\n",
+ start, start+size-1);
+ } else
+ memblock_reserve(start, size);
+ }
+}
+
+void __init efi_free_boot_services(void)
+{
+ void *p;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ efi_memory_desc_t *md = p;
+ unsigned long long start = md->phys_addr;
+ unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_BOOT_SERVICES_DATA)
+ continue;
+
+ /* Could not reserve boot area */
+ if (!size)
+ continue;
+
+ free_bootmem_late(start, size);
+ }
+
+ efi_unmap_memmap();
+}
+
+/*
+ * A number of config table entries get remapped to virtual addresses
+ * after entering EFI virtual mode. However, the kexec kernel requires
+ * their physical addresses therefore we pass them via setup_data and
+ * correct those entries to their respective physical addresses here.
+ *
+ * Currently only handles smbios which is necessary for some firmware
+ * implementation.
+ */
+int __init efi_reuse_config(u64 tables, int nr_tables)
+{
+ int i, sz, ret = 0;
+ void *p, *tablep;
+ struct efi_setup_data *data;
+
+ if (!efi_setup)
+ return 0;
+
+ if (!efi_enabled(EFI_64BIT))
+ return 0;
+
+ data = early_memremap(efi_setup, sizeof(*data));
+ if (!data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (!data->smbios)
+ goto out_memremap;
+
+ sz = sizeof(efi_config_table_64_t);
+
+ p = tablep = early_memremap(tables, nr_tables * sz);
+ if (!p) {
+ pr_err("Could not map Configuration table!\n");
+ ret = -ENOMEM;
+ goto out_memremap;
+ }
+
+ for (i = 0; i < efi.systab->nr_tables; i++) {
+ efi_guid_t guid;
+
+ guid = ((efi_config_table_64_t *)p)->guid;
+
+ if (!efi_guidcmp(guid, SMBIOS_TABLE_GUID))
+ ((efi_config_table_64_t *)p)->table = data->smbios;
+ p += sz;
+ }
+ early_memunmap(tablep, nr_tables * sz);
+
+out_memremap:
+ early_memunmap(data, sizeof(*data));
+out:
+ return ret;
+}
+
+void __init efi_apply_memmap_quirks(void)
+{
+ /*
+ * Once setup is done earlier, unmap the EFI memory map on mismatched
+ * firmware/kernel architectures since there is no support for runtime
+ * services.
+ */
+ if (!efi_runtime_supported()) {
+ pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
+ efi_unmap_memmap();
+ }
+
+ /*
+ * UV doesn't support the new EFI pagetable mapping yet.
+ */
+ if (is_uv_system())
+ set_bit(EFI_OLD_MEMMAP, &efi.flags);
+}
+
+/*
+ * For most modern platforms the preferred method of powering off is via
+ * ACPI. However, there are some that are known to require the use of
+ * EFI runtime services and for which ACPI does not work at all.
+ *
+ * Using EFI is a last resort, to be used only if no other option
+ * exists.
+ */
+bool efi_reboot_required(void)
+{
+ if (!acpi_gbl_reduced_hardware)
+ return false;
+
+ efi_reboot_quirk_mode = EFI_RESET_WARM;
+ return true;
+}
+
+bool efi_poweroff_required(void)
+{
+ return !!acpi_gbl_reduced_hardware;
+}
diff --git a/arch/x86/platform/ts5500/ts5500.c b/arch/x86/platform/ts5500/ts5500.c
index 9471b9456f2..baf16e72e66 100644
--- a/arch/x86/platform/ts5500/ts5500.c
+++ b/arch/x86/platform/ts5500/ts5500.c
@@ -1,7 +1,7 @@
/*
* Technologic Systems TS-5500 Single Board Computer support
*
- * Copyright (C) 2013 Savoir-faire Linux Inc.
+ * Copyright (C) 2013-2014 Savoir-faire Linux Inc.
* Vivien Didelot <vivien.didelot@savoirfairelinux.com>
*
* This program is free software; you can redistribute it and/or modify it under
@@ -15,8 +15,8 @@
* state or available options. For further information about sysfs entries, see
* Documentation/ABI/testing/sysfs-platform-ts5500.
*
- * This code actually supports the TS-5500 platform, but it may be extended to
- * support similar Technologic Systems x86-based platforms, such as the TS-5600.
+ * This code may be extended to support similar x86-based platforms.
+ * Actually, the TS-5500 and TS-5400 are supported.
*/
#include <linux/delay.h>
@@ -32,6 +32,7 @@
/* Product code register */
#define TS5500_PRODUCT_CODE_ADDR 0x74
#define TS5500_PRODUCT_CODE 0x60 /* TS-5500 product code */
+#define TS5400_PRODUCT_CODE 0x40 /* TS-5400 product code */
/* SRAM/RS-485/ADC options, and RS-485 RTS/Automatic RS-485 flags register */
#define TS5500_SRAM_RS485_ADC_ADDR 0x75
@@ -66,6 +67,7 @@
/**
* struct ts5500_sbc - TS-5500 board description
+ * @name: Board model name.
* @id: Board product ID.
* @sram: Flag for SRAM option.
* @rs485: Flag for RS-485 option.
@@ -75,6 +77,7 @@
* @jumpers: Bitfield for jumpers' state.
*/
struct ts5500_sbc {
+ const char *name;
int id;
bool sram;
bool rs485;
@@ -122,13 +125,16 @@ static int __init ts5500_detect_config(struct ts5500_sbc *sbc)
if (!request_region(TS5500_PRODUCT_CODE_ADDR, 4, "ts5500"))
return -EBUSY;
- tmp = inb(TS5500_PRODUCT_CODE_ADDR);
- if (tmp != TS5500_PRODUCT_CODE) {
- pr_err("This platform is not a TS-5500 (found ID 0x%x)\n", tmp);
+ sbc->id = inb(TS5500_PRODUCT_CODE_ADDR);
+ if (sbc->id == TS5500_PRODUCT_CODE) {
+ sbc->name = "TS-5500";
+ } else if (sbc->id == TS5400_PRODUCT_CODE) {
+ sbc->name = "TS-5400";
+ } else {
+ pr_err("ts5500: unknown product code 0x%x\n", sbc->id);
ret = -ENODEV;
goto cleanup;
}
- sbc->id = tmp;
tmp = inb(TS5500_SRAM_RS485_ADC_ADDR);
sbc->sram = tmp & TS5500_SRAM;
@@ -147,48 +153,52 @@ cleanup:
return ret;
}
-static ssize_t ts5500_show_id(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t name_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
struct ts5500_sbc *sbc = dev_get_drvdata(dev);
- return sprintf(buf, "0x%.2x\n", sbc->id);
+ return sprintf(buf, "%s\n", sbc->name);
}
+static DEVICE_ATTR_RO(name);
-static ssize_t ts5500_show_jumpers(struct device *dev,
- struct device_attribute *attr,
- char *buf)
+static ssize_t id_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
{
struct ts5500_sbc *sbc = dev_get_drvdata(dev);
- return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1);
+ return sprintf(buf, "0x%.2x\n", sbc->id);
}
+static DEVICE_ATTR_RO(id);
-#define TS5500_SHOW(field) \
- static ssize_t ts5500_show_##field(struct device *dev, \
- struct device_attribute *attr, \
- char *buf) \
- { \
- struct ts5500_sbc *sbc = dev_get_drvdata(dev); \
- return sprintf(buf, "%d\n", sbc->field); \
- }
-
-TS5500_SHOW(sram)
-TS5500_SHOW(rs485)
-TS5500_SHOW(adc)
-TS5500_SHOW(ereset)
-TS5500_SHOW(itr)
+static ssize_t jumpers_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev);
-static DEVICE_ATTR(id, S_IRUGO, ts5500_show_id, NULL);
-static DEVICE_ATTR(jumpers, S_IRUGO, ts5500_show_jumpers, NULL);
-static DEVICE_ATTR(sram, S_IRUGO, ts5500_show_sram, NULL);
-static DEVICE_ATTR(rs485, S_IRUGO, ts5500_show_rs485, NULL);
-static DEVICE_ATTR(adc, S_IRUGO, ts5500_show_adc, NULL);
-static DEVICE_ATTR(ereset, S_IRUGO, ts5500_show_ereset, NULL);
-static DEVICE_ATTR(itr, S_IRUGO, ts5500_show_itr, NULL);
+ return sprintf(buf, "0x%.2x\n", sbc->jumpers >> 1);
+}
+static DEVICE_ATTR_RO(jumpers);
+
+#define TS5500_ATTR_BOOL(_field) \
+ static ssize_t _field##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
+ { \
+ struct ts5500_sbc *sbc = dev_get_drvdata(dev); \
+ \
+ return sprintf(buf, "%d\n", sbc->_field); \
+ } \
+ static DEVICE_ATTR_RO(_field)
+
+TS5500_ATTR_BOOL(sram);
+TS5500_ATTR_BOOL(rs485);
+TS5500_ATTR_BOOL(adc);
+TS5500_ATTR_BOOL(ereset);
+TS5500_ATTR_BOOL(itr);
static struct attribute *ts5500_attributes[] = {
&dev_attr_id.attr,
+ &dev_attr_name.attr,
&dev_attr_jumpers.attr,
&dev_attr_sram.attr,
&dev_attr_rs485.attr,
@@ -311,12 +321,14 @@ static int __init ts5500_init(void)
if (err)
goto error;
- ts5500_dio1_pdev.dev.parent = &pdev->dev;
- if (platform_device_register(&ts5500_dio1_pdev))
- dev_warn(&pdev->dev, "DIO1 block registration failed\n");
- ts5500_dio2_pdev.dev.parent = &pdev->dev;
- if (platform_device_register(&ts5500_dio2_pdev))
- dev_warn(&pdev->dev, "DIO2 block registration failed\n");
+ if (sbc->id == TS5500_PRODUCT_CODE) {
+ ts5500_dio1_pdev.dev.parent = &pdev->dev;
+ if (platform_device_register(&ts5500_dio1_pdev))
+ dev_warn(&pdev->dev, "DIO1 block registration failed\n");
+ ts5500_dio2_pdev.dev.parent = &pdev->dev;
+ if (platform_device_register(&ts5500_dio2_pdev))
+ dev_warn(&pdev->dev, "DIO2 block registration failed\n");
+ }
if (led_classdev_register(&pdev->dev, &ts5500_led_cdev))
dev_warn(&pdev->dev, "LED registration failed\n");
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index dfe605ac1bc..3968d67d366 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1,7 +1,7 @@
/*
* SGI UltraViolet TLB flush routines.
*
- * (c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.
+ * (c) 2008-2014 Cliff Wickman <cpw@sgi.com>, SGI.
*
* This code is released under the GNU General Public License version 2 or
* later.
@@ -451,7 +451,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
/*
* The reverse of the above; converts a duration in ns to a duration in cycles.
- */
+ */
static inline unsigned long long ns_2_cycles(unsigned long long ns)
{
struct cyc2ns_data *data = cyc2ns_read_begin();
@@ -563,7 +563,7 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
* UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
* But not currently used.
*/
-static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
+static unsigned long uv2_3_read_status(unsigned long offset, int rshft, int desc)
{
unsigned long descriptor_status;
@@ -606,7 +606,7 @@ int handle_uv2_busy(struct bau_control *bcp)
return FLUSH_GIVEUP;
}
-static int uv2_wait_completion(struct bau_desc *bau_desc,
+static int uv2_3_wait_completion(struct bau_desc *bau_desc,
unsigned long mmr_offset, int right_shift,
struct bau_control *bcp, long try)
{
@@ -616,7 +616,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
long busy_reps = 0;
struct ptc_stats *stat = bcp->statp;
- descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc);
+ descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc);
/* spin on the status MMR, waiting for it to go idle */
while (descriptor_stat != UV2H_DESC_IDLE) {
@@ -658,8 +658,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
/* not to hammer on the clock */
busy_reps = 0;
ttm = get_cycles();
- if ((ttm - bcp->send_message) >
- bcp->timeout_interval)
+ if ((ttm - bcp->send_message) > bcp->timeout_interval)
return handle_uv2_busy(bcp);
}
/*
@@ -667,8 +666,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
*/
cpu_relax();
}
- descriptor_stat = uv2_read_status(mmr_offset, right_shift,
- desc);
+ descriptor_stat = uv2_3_read_status(mmr_offset, right_shift, desc);
}
bcp->conseccompletes++;
return FLUSH_COMPLETE;
@@ -679,8 +677,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
* which register to read and position in that register based on cpu in
* current hub.
*/
-static int wait_completion(struct bau_desc *bau_desc,
- struct bau_control *bcp, long try)
+static int wait_completion(struct bau_desc *bau_desc, struct bau_control *bcp, long try)
{
int right_shift;
unsigned long mmr_offset;
@@ -695,11 +692,9 @@ static int wait_completion(struct bau_desc *bau_desc,
}
if (bcp->uvhub_version == 1)
- return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
- bcp, try);
+ return uv1_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try);
else
- return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
- bcp, try);
+ return uv2_3_wait_completion(bau_desc, mmr_offset, right_shift, bcp, try);
}
/*
@@ -888,7 +883,7 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
struct ptc_stats *stat = bcp->statp;
struct bau_control *hmaster = bcp->uvhub_master;
struct uv1_bau_msg_header *uv1_hdr = NULL;
- struct uv2_bau_msg_header *uv2_hdr = NULL;
+ struct uv2_3_bau_msg_header *uv2_3_hdr = NULL;
if (bcp->uvhub_version == 1) {
uv1 = 1;
@@ -902,27 +897,28 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
if (uv1)
uv1_hdr = &bau_desc->header.uv1_hdr;
else
- uv2_hdr = &bau_desc->header.uv2_hdr;
+ /* uv2 and uv3 */
+ uv2_3_hdr = &bau_desc->header.uv2_3_hdr;
do {
if (try == 0) {
if (uv1)
uv1_hdr->msg_type = MSG_REGULAR;
else
- uv2_hdr->msg_type = MSG_REGULAR;
+ uv2_3_hdr->msg_type = MSG_REGULAR;
seq_number = bcp->message_number++;
} else {
if (uv1)
uv1_hdr->msg_type = MSG_RETRY;
else
- uv2_hdr->msg_type = MSG_RETRY;
+ uv2_3_hdr->msg_type = MSG_RETRY;
stat->s_retry_messages++;
}
if (uv1)
uv1_hdr->sequence = seq_number;
else
- uv2_hdr->sequence = seq_number;
+ uv2_3_hdr->sequence = seq_number;
index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
bcp->send_message = get_cycles();
@@ -1080,8 +1076,10 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
* done. The returned pointer is valid till preemption is re-enabled.
*/
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned int cpu)
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end,
+ unsigned int cpu)
{
int locals = 0;
int remotes = 0;
@@ -1268,6 +1266,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
if (bcp->uvhub_version == 2)
process_uv2_message(&msgdesc, bcp);
else
+ /* no error workaround for uv1 or uv3 */
bau_process_message(&msgdesc, bcp, 1);
msg++;
@@ -1325,8 +1324,12 @@ static void __init enable_timeouts(void)
*/
mmr_image |= (1L << SOFTACK_MSHIFT);
if (is_uv2_hub()) {
+ /* do not touch the legacy mode bit */
/* hw bug workaround; do not use extended status */
mmr_image &= ~(1L << UV2_EXT_SHFT);
+ } else if (is_uv3_hub()) {
+ mmr_image &= ~(1L << PREFETCH_HINT_SHFT);
+ mmr_image |= (1L << SB_STATUS_SHFT);
}
write_mmr_misc_control(pnode, mmr_image);
}
@@ -1476,7 +1479,7 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user,
return count;
}
- if (strict_strtol(optstr, 10, &input_arg) < 0) {
+ if (kstrtol(optstr, 10, &input_arg) < 0) {
printk(KERN_DEBUG "%s is invalid\n", optstr);
return -EINVAL;
}
@@ -1692,7 +1695,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
struct bau_desc *bau_desc;
struct bau_desc *bd2;
struct uv1_bau_msg_header *uv1_hdr;
- struct uv2_bau_msg_header *uv2_hdr;
+ struct uv2_3_bau_msg_header *uv2_3_hdr;
struct bau_control *bcp;
/*
@@ -1739,15 +1742,15 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
*/
} else {
/*
- * BIOS uses legacy mode, but UV2 hardware always
+ * BIOS uses legacy mode, but uv2 and uv3 hardware always
* uses native mode for selective broadcasts.
*/
- uv2_hdr = &bd2->header.uv2_hdr;
- uv2_hdr->swack_flag = 1;
- uv2_hdr->base_dest_nasid =
+ uv2_3_hdr = &bd2->header.uv2_3_hdr;
+ uv2_3_hdr->swack_flag = 1;
+ uv2_3_hdr->base_dest_nasid =
UV_PNODE_TO_NASID(base_pnode);
- uv2_hdr->dest_subnodeid = UV_LB_SUBNODEID;
- uv2_hdr->command = UV_NET_ENDPOINT_INTD;
+ uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID;
+ uv2_3_hdr->command = UV_NET_ENDPOINT_INTD;
}
}
for_each_present_cpu(cpu) {
@@ -1858,6 +1861,7 @@ static int calculate_destination_timeout(void)
ts_ns *= (mult1 * mult2);
ret = ts_ns / 1000;
} else {
+ /* same destination timeout for uv2 and uv3 */
/* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */
mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL);
mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
@@ -2012,8 +2016,10 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
bcp->uvhub_version = 1;
else if (is_uv2_hub())
bcp->uvhub_version = 2;
+ else if (is_uv3_hub())
+ bcp->uvhub_version = 3;
else {
- printk(KERN_EMERG "uvhub version not 1 or 2\n");
+ printk(KERN_EMERG "uvhub version not 1, 2 or 3\n");
return 1;
}
bcp->uvhub_master = *hmasterp;
@@ -2138,9 +2144,10 @@ static int __init uv_bau_init(void)
}
vector = UV_BAU_MESSAGE;
- for_each_possible_blade(uvhub)
+ for_each_possible_blade(uvhub) {
if (uv_blade_nr_possible_cpus(uvhub))
init_uvhub(uvhub, vector, uv_base_pnode);
+ }
alloc_intr_gate(vector, uv_bau_message_intr1);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 424f4c97a44..6ec7910f59b 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -165,7 +165,7 @@ static void fix_processor_context(void)
* by __save_processor_state()
* @ctxt - structure to load the registers contents from
*/
-static void __restore_processor_state(struct saved_context *ctxt)
+static void notrace __restore_processor_state(struct saved_context *ctxt)
{
if (ctxt->misc_enable_saved)
wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable);
@@ -239,7 +239,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
}
/* Needed by apm.c */
-void restore_processor_state(void)
+void notrace restore_processor_state(void)
{
__restore_processor_state(&saved_context);
}
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
new file mode 100644
index 00000000000..7fde9ee438a
--- /dev/null
+++ b/arch/x86/purgatory/Makefile
@@ -0,0 +1,30 @@
+purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o
+
+targets += $(purgatory-y)
+PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
+
+LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib
+targets += purgatory.ro
+
+# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
+# in turn leaves some undefined symbols like __fentry__ in purgatory and not
+# sure how to relocate those. Like kexec-tools, use custom flags.
+
+KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os -mcmodel=large
+
+$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
+ $(call if_changed,ld)
+
+targets += kexec-purgatory.c
+
+quiet_cmd_bin2c = BIN2C $@
+ cmd_bin2c = cat $(obj)/purgatory.ro | $(objtree)/scripts/basic/bin2c kexec_purgatory > $(obj)/kexec-purgatory.c
+
+$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
+ $(call if_changed,bin2c)
+
+
+# No loaders for 32bits yet.
+ifeq ($(CONFIG_X86_64),y)
+ obj-$(CONFIG_KEXEC) += kexec-purgatory.o
+endif
diff --git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S
new file mode 100644
index 00000000000..d1a4291d356
--- /dev/null
+++ b/arch/x86/purgatory/entry64.S
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (C) 2014 Red Hat Inc.
+
+ * Author(s): Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This code has been taken from kexec-tools.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ .text
+ .balign 16
+ .code64
+ .globl entry64, entry64_regs
+
+
+entry64:
+ /* Setup a gdt that should be preserved */
+ lgdt gdt(%rip)
+
+ /* load the data segments */
+ movl $0x18, %eax /* data segment */
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+ movl %eax, %fs
+ movl %eax, %gs
+
+ /* Setup new stack */
+ leaq stack_init(%rip), %rsp
+ pushq $0x10 /* CS */
+ leaq new_cs_exit(%rip), %rax
+ pushq %rax
+ lretq
+new_cs_exit:
+
+ /* Load the registers */
+ movq rax(%rip), %rax
+ movq rbx(%rip), %rbx
+ movq rcx(%rip), %rcx
+ movq rdx(%rip), %rdx
+ movq rsi(%rip), %rsi
+ movq rdi(%rip), %rdi
+ movq rsp(%rip), %rsp
+ movq rbp(%rip), %rbp
+ movq r8(%rip), %r8
+ movq r9(%rip), %r9
+ movq r10(%rip), %r10
+ movq r11(%rip), %r11
+ movq r12(%rip), %r12
+ movq r13(%rip), %r13
+ movq r14(%rip), %r14
+ movq r15(%rip), %r15
+
+ /* Jump to the new code... */
+ jmpq *rip(%rip)
+
+ .section ".rodata"
+ .balign 4
+entry64_regs:
+rax: .quad 0x0
+rcx: .quad 0x0
+rdx: .quad 0x0
+rbx: .quad 0x0
+rsp: .quad 0x0
+rbp: .quad 0x0
+rsi: .quad 0x0
+rdi: .quad 0x0
+r8: .quad 0x0
+r9: .quad 0x0
+r10: .quad 0x0
+r11: .quad 0x0
+r12: .quad 0x0
+r13: .quad 0x0
+r14: .quad 0x0
+r15: .quad 0x0
+rip: .quad 0x0
+ .size entry64_regs, . - entry64_regs
+
+ /* GDT */
+ .section ".rodata"
+ .balign 16
+gdt:
+ /* 0x00 unusable segment
+ * 0x08 unused
+ * so use them as gdt ptr
+ */
+ .word gdt_end - gdt - 1
+ .quad gdt
+ .word 0, 0, 0
+
+ /* 0x10 4GB flat code segment */
+ .word 0xFFFF, 0x0000, 0x9A00, 0x00AF
+
+ /* 0x18 4GB flat data segment */
+ .word 0xFFFF, 0x0000, 0x9200, 0x00CF
+gdt_end:
+stack: .quad 0, 0
+stack_init:
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
new file mode 100644
index 00000000000..25e068ba338
--- /dev/null
+++ b/arch/x86/purgatory/purgatory.c
@@ -0,0 +1,72 @@
+/*
+ * purgatory: Runs between two kernels
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include "sha256.h"
+#include "../boot/string.h"
+
+struct sha_region {
+ unsigned long start;
+ unsigned long len;
+};
+
+unsigned long backup_dest = 0;
+unsigned long backup_src = 0;
+unsigned long backup_sz = 0;
+
+u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 };
+
+struct sha_region sha_regions[16] = {};
+
+/*
+ * On x86, second kernel requries first 640K of memory to boot. Copy
+ * first 640K to a backup region in reserved memory range so that second
+ * kernel can use first 640K.
+ */
+static int copy_backup_region(void)
+{
+ if (backup_dest)
+ memcpy((void *)backup_dest, (void *)backup_src, backup_sz);
+
+ return 0;
+}
+
+int verify_sha256_digest(void)
+{
+ struct sha_region *ptr, *end;
+ u8 digest[SHA256_DIGEST_SIZE];
+ struct sha256_state sctx;
+
+ sha256_init(&sctx);
+ end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])];
+ for (ptr = sha_regions; ptr < end; ptr++)
+ sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len);
+
+ sha256_final(&sctx, digest);
+
+ if (memcmp(digest, sha256_digest, sizeof(digest)))
+ return 1;
+
+ return 0;
+}
+
+void purgatory(void)
+{
+ int ret;
+
+ ret = verify_sha256_digest();
+ if (ret) {
+ /* loop forever */
+ for (;;)
+ ;
+ }
+ copy_backup_region();
+}
diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S
new file mode 100644
index 00000000000..fe3c91ba1bd
--- /dev/null
+++ b/arch/x86/purgatory/setup-x86_64.S
@@ -0,0 +1,58 @@
+/*
+ * purgatory: setup code
+ *
+ * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * This code has been taken from kexec-tools.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ .text
+ .globl purgatory_start
+ .balign 16
+purgatory_start:
+ .code64
+
+ /* Load a gdt so I know what the segment registers are */
+ lgdt gdt(%rip)
+
+ /* load the data segments */
+ movl $0x18, %eax /* data segment */
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+ movl %eax, %fs
+ movl %eax, %gs
+
+ /* Setup a stack */
+ leaq lstack_end(%rip), %rsp
+
+ /* Call the C code */
+ call purgatory
+ jmp entry64
+
+ .section ".rodata"
+ .balign 16
+gdt: /* 0x00 unusable segment
+ * 0x08 unused
+ * so use them as the gdt ptr
+ */
+ .word gdt_end - gdt - 1
+ .quad gdt
+ .word 0, 0, 0
+
+ /* 0x10 4GB flat code segment */
+ .word 0xFFFF, 0x0000, 0x9A00, 0x00AF
+
+ /* 0x18 4GB flat data segment */
+ .word 0xFFFF, 0x0000, 0x9200, 0x00CF
+gdt_end:
+
+ .bss
+ .balign 4096
+lstack:
+ .skip 4096
+lstack_end:
diff --git a/arch/x86/purgatory/sha256.c b/arch/x86/purgatory/sha256.c
new file mode 100644
index 00000000000..548ca675a14
--- /dev/null
+++ b/arch/x86/purgatory/sha256.c
@@ -0,0 +1,283 @@
+/*
+ * SHA-256, as specified in
+ * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
+ *
+ * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2014 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+#include "sha256.h"
+#include "../boot/string.h"
+
+static inline u32 Ch(u32 x, u32 y, u32 z)
+{
+ return z ^ (x & (y ^ z));
+}
+
+static inline u32 Maj(u32 x, u32 y, u32 z)
+{
+ return (x & y) | (z & (x | y));
+}
+
+#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22))
+#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25))
+#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3))
+#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10))
+
+static inline void LOAD_OP(int I, u32 *W, const u8 *input)
+{
+ W[I] = __be32_to_cpu(((__be32 *)(input))[I]);
+}
+
+static inline void BLEND_OP(int I, u32 *W)
+{
+ W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
+}
+
+static void sha256_transform(u32 *state, const u8 *input)
+{
+ u32 a, b, c, d, e, f, g, h, t1, t2;
+ u32 W[64];
+ int i;
+
+ /* load the input */
+ for (i = 0; i < 16; i++)
+ LOAD_OP(i, W, input);
+
+ /* now blend */
+ for (i = 16; i < 64; i++)
+ BLEND_OP(i, W);
+
+ /* load the state into our registers */
+ a = state[0]; b = state[1]; c = state[2]; d = state[3];
+ e = state[4]; f = state[5]; g = state[6]; h = state[7];
+
+ /* now iterate */
+ t1 = h + e1(e) + Ch(e, f, g) + 0x428a2f98 + W[0];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x71374491 + W[1];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0xb5c0fbcf + W[2];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0xe9b5dba5 + W[3];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x3956c25b + W[4];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x59f111f1 + W[5];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x923f82a4 + W[6];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0xab1c5ed5 + W[7];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0xd807aa98 + W[8];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x12835b01 + W[9];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x243185be + W[10];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x550c7dc3 + W[11];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x72be5d74 + W[12];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x80deb1fe + W[13];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x9bdc06a7 + W[14];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0xc19bf174 + W[15];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0xe49b69c1 + W[16];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0xefbe4786 + W[17];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x0fc19dc6 + W[18];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x240ca1cc + W[19];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x2de92c6f + W[20];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x4a7484aa + W[21];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x5cb0a9dc + W[22];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x76f988da + W[23];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x983e5152 + W[24];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0xa831c66d + W[25];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0xb00327c8 + W[26];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0xbf597fc7 + W[27];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0xc6e00bf3 + W[28];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0xd5a79147 + W[29];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x06ca6351 + W[30];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x14292967 + W[31];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x27b70a85 + W[32];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x2e1b2138 + W[33];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x4d2c6dfc + W[34];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x53380d13 + W[35];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x650a7354 + W[36];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x766a0abb + W[37];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x81c2c92e + W[38];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x92722c85 + W[39];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0xa2bfe8a1 + W[40];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0xa81a664b + W[41];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0xc24b8b70 + W[42];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0xc76c51a3 + W[43];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0xd192e819 + W[44];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0xd6990624 + W[45];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0xf40e3585 + W[46];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x106aa070 + W[47];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x19a4c116 + W[48];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x1e376c08 + W[49];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x2748774c + W[50];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x34b0bcb5 + W[51];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x391c0cb3 + W[52];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x4ed8aa4a + W[53];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x5b9cca4f + W[54];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x682e6ff3 + W[55];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x748f82ee + W[56];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x78a5636f + W[57];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x84c87814 + W[58];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x8cc70208 + W[59];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x90befffa + W[60];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0xa4506ceb + W[61];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0xbef9a3f7 + W[62];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0xc67178f2 + W[63];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ state[0] += a; state[1] += b; state[2] += c; state[3] += d;
+ state[4] += e; state[5] += f; state[6] += g; state[7] += h;
+
+ /* clear any sensitive info... */
+ a = b = c = d = e = f = g = h = t1 = t2 = 0;
+ memset(W, 0, 64 * sizeof(u32));
+}
+
+int sha256_init(struct sha256_state *sctx)
+{
+ sctx->state[0] = SHA256_H0;
+ sctx->state[1] = SHA256_H1;
+ sctx->state[2] = SHA256_H2;
+ sctx->state[3] = SHA256_H3;
+ sctx->state[4] = SHA256_H4;
+ sctx->state[5] = SHA256_H5;
+ sctx->state[6] = SHA256_H6;
+ sctx->state[7] = SHA256_H7;
+ sctx->count = 0;
+
+ return 0;
+}
+
+int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len)
+{
+ unsigned int partial, done;
+ const u8 *src;
+
+ partial = sctx->count & 0x3f;
+ sctx->count += len;
+ done = 0;
+ src = data;
+
+ if ((partial + len) > 63) {
+ if (partial) {
+ done = -partial;
+ memcpy(sctx->buf + partial, data, done + 64);
+ src = sctx->buf;
+ }
+
+ do {
+ sha256_transform(sctx->state, src);
+ done += 64;
+ src = data + done;
+ } while (done + 63 < len);
+
+ partial = 0;
+ }
+ memcpy(sctx->buf + partial, src, len - done);
+
+ return 0;
+}
+
+int sha256_final(struct sha256_state *sctx, u8 *out)
+{
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ unsigned int index, pad_len;
+ int i;
+ static const u8 padding[64] = { 0x80, };
+
+ /* Save number of bits */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64. */
+ index = sctx->count & 0x3f;
+ pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
+ sha256_update(sctx, padding, pad_len);
+
+ /* Append length (before padding) */
+ sha256_update(sctx, (const u8 *)&bits, sizeof(bits));
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Zeroize sensitive information. */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h
new file mode 100644
index 00000000000..bd15a412773
--- /dev/null
+++ b/arch/x86/purgatory/sha256.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author: Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#ifndef SHA256_H
+#define SHA256_H
+
+
+#include <linux/types.h>
+#include <crypto/sha.h>
+
+extern int sha256_init(struct sha256_state *sctx);
+extern int sha256_update(struct sha256_state *sctx, const u8 *input,
+ unsigned int length);
+extern int sha256_final(struct sha256_state *sctx, u8 *hash);
+
+#endif /* SHA256_H */
diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S
new file mode 100644
index 00000000000..3cefba1fefc
--- /dev/null
+++ b/arch/x86/purgatory/stack.S
@@ -0,0 +1,19 @@
+/*
+ * purgatory: stack
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ /* A stack for the loaded kernel.
+ * Seperate and in the data section so it can be prepopulated.
+ */
+ .data
+ .balign 4096
+ .globl stack, stack_end
+
+stack:
+ .skip 4096
+stack_end:
diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c
new file mode 100644
index 00000000000..d886b1fa36f
--- /dev/null
+++ b/arch/x86/purgatory/string.c
@@ -0,0 +1,13 @@
+/*
+ * Simple string functions.
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include "../boot/string.c"
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index d6b86792161..028b78168d8 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -360,3 +360,6 @@
351 i386 sched_setattr sys_sched_setattr
352 i386 sched_getattr sys_sched_getattr
353 i386 renameat2 sys_renameat2
+354 i386 seccomp sys_seccomp
+355 i386 getrandom sys_getrandom
+356 i386 memfd_create sys_memfd_create
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index ec255a1646d..35dd922727b 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -323,6 +323,10 @@
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr
316 common renameat2 sys_renameat2
+317 common seccomp sys_seccomp
+318 common getrandom sys_getrandom
+319 common memfd_create sys_memfd_create
+320 common kexec_file_load sys_kexec_file_load
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 0feee2fd507..25a1022dd79 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -216,6 +216,5 @@ extern long elf_aux_hwcap;
#define ELF_HWCAP (elf_aux_hwcap)
#define SET_PERSONALITY(ex) do ; while(0)
-#define __HAVE_ARCH_GATE_AREA 1
#endif
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 04f82e020f2..2a206d2b14a 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -25,7 +25,8 @@ static inline void rep_nop(void)
__asm__ __volatile__("rep;nop": : :"memory");
}
-#define cpu_relax() rep_nop()
+#define cpu_relax() rep_nop()
+#define cpu_relax_lowlatency() cpu_relax()
#include <asm/processor-generic.h>
diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c
index c6492e75797..f8fecaddcc0 100644
--- a/arch/x86/um/mem_64.c
+++ b/arch/x86/um/mem_64.c
@@ -9,18 +9,3 @@ const char *arch_vma_name(struct vm_area_struct *vma)
return NULL;
}
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
- return NULL;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
- return 0;
-}
-
-int in_gate_area_no_mm(unsigned long addr)
-{
- return 0;
-}
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 5e04a1c899f..79d824551c1 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -370,13 +370,12 @@ struct rt_sigframe
char retcode[8];
};
-int setup_signal_stack_sc(unsigned long stack_top, int sig,
- struct k_sigaction *ka, struct pt_regs *regs,
- sigset_t *mask)
+int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *mask)
{
struct sigframe __user *frame;
void __user *restorer;
- int err = 0;
+ int err = 0, sig = ksig->sig;
/* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */
stack_top = ((stack_top + 4) & -16UL) - 4;
@@ -385,8 +384,8 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig,
return 1;
restorer = frame->retcode;
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
err |= __put_user(restorer, &frame->pretcode);
err |= __put_user(sig, &frame->sig);
@@ -410,20 +409,19 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig,
return err;
PT_REGS_SP(regs) = (unsigned long) frame;
- PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
PT_REGS_AX(regs) = (unsigned long) sig;
PT_REGS_DX(regs) = (unsigned long) 0;
PT_REGS_CX(regs) = (unsigned long) 0;
return 0;
}
-int setup_signal_stack_si(unsigned long stack_top, int sig,
- struct k_sigaction *ka, struct pt_regs *regs,
- siginfo_t *info, sigset_t *mask)
+int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *mask)
{
struct rt_sigframe __user *frame;
void __user *restorer;
- int err = 0;
+ int err = 0, sig = ksig->sig;
stack_top &= -8UL;
frame = (struct rt_sigframe __user *) stack_top - 1;
@@ -431,14 +429,14 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
return 1;
restorer = frame->retcode;
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
err |= __put_user(restorer, &frame->pretcode);
err |= __put_user(sig, &frame->sig);
err |= __put_user(&frame->info, &frame->pinfo);
err |= __put_user(&frame->uc, &frame->puc);
- err |= copy_siginfo_to_user(&frame->info, info);
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask,
PT_REGS_SP(regs));
@@ -457,7 +455,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
return err;
PT_REGS_SP(regs) = (unsigned long) frame;
- PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
PT_REGS_AX(regs) = (unsigned long) sig;
PT_REGS_DX(regs) = (unsigned long) &frame->info;
PT_REGS_CX(regs) = (unsigned long) &frame->uc;
@@ -502,12 +500,11 @@ struct rt_sigframe
struct _fpstate fpstate;
};
-int setup_signal_stack_si(unsigned long stack_top, int sig,
- struct k_sigaction *ka, struct pt_regs * regs,
- siginfo_t *info, sigset_t *set)
+int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *set)
{
struct rt_sigframe __user *frame;
- int err = 0;
+ int err = 0, sig = ksig->sig;
frame = (struct rt_sigframe __user *)
round_down(stack_top - sizeof(struct rt_sigframe), 16);
@@ -517,8 +514,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto out;
- if (ka->sa.sa_flags & SA_SIGINFO) {
- err |= copy_siginfo_to_user(&frame->info, info);
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
if (err)
goto out;
}
@@ -543,8 +540,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
* already in userspace.
*/
/* x86-64 should always use SA_RESTORER. */
- if (ka->sa.sa_flags & SA_RESTORER)
- err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ err |= __put_user(ksig->ka.sa.sa_restorer, &frame->pretcode);
else
/* could use a vstub here */
return err;
@@ -570,7 +567,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
*/
PT_REGS_SI(regs) = (unsigned long) &frame->info;
PT_REGS_DX(regs) = (unsigned long) &frame->uc;
- PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
out:
return err;
}
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 3c0809a0631..5a4affe025e 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -10,8 +10,7 @@ VDSO32-$(CONFIG_X86_32) := y
VDSO32-$(CONFIG_COMPAT) := y
# files to link into the vdso
-vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vdso-fakesections.o
-vobjs-nox32 := vdso-fakesections.o
+vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
# files to link into kernel
obj-y += vma.o
@@ -38,7 +37,8 @@ vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg)
obj-y += $(vdso_img_objs)
targets += $(vdso_img_cfiles)
targets += $(vdso_img_sodbg)
-.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c)
+.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \
+ $(vdso_img-y:%=$(obj)/vdso%.so)
export CPPFLAGS_vdso.lds += -P -C
@@ -55,10 +55,10 @@ hostprogs-y += vdso2c
quiet_cmd_vdso2c = VDSO2C $@
define cmd_vdso2c
- $(obj)/vdso2c $< $@
+ $(obj)/vdso2c $< $(<:%.dbg=%) $@
endef
-$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso2c FORCE
+$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
$(call if_changed,vdso2c)
#
@@ -67,7 +67,8 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso2c FORCE
#
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
$(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
- -fno-omit-frame-pointer -foptimize-sibling-calls
+ -fno-omit-frame-pointer -foptimize-sibling-calls \
+ -DDISABLE_BRANCH_PROFILING
$(vobjs): KBUILD_CFLAGS += $(CFL)
@@ -113,6 +114,10 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE
targets += vdsox32.lds $(vobjx32s-y)
+$(obj)/%.so: OBJCOPYFLAGS := -S
+$(obj)/%.so: $(obj)/%.so.dbg
+ $(call if_changed,objcopy)
+
$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
$(call if_changed,vdso)
@@ -150,6 +155,7 @@ KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
+KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
@@ -169,14 +175,24 @@ quiet_cmd_vdso = VDSO $@
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \
- -Wl,-Bsymbolic $(LTO_CFLAGS)
+ $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
GCOV_PROFILE := n
#
-# Install the unstripped copies of vdso*.so.
+# Install the unstripped copies of vdso*.so. If our toolchain supports
+# build-id, install .build-id links as well.
#
quiet_cmd_vdso_install = INSTALL $(@:install_%=%)
- cmd_vdso_install = cp $< $(MODLIB)/vdso/$(@:install_%=%)
+define cmd_vdso_install
+ cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \
+ if readelf -n $< |grep -q 'Build ID'; then \
+ buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \
+ first=`echo $$buildid | cut -b-2`; \
+ last=`echo $$buildid | cut -b3-`; \
+ mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \
+ ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \
+ fi
+endef
vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%)
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index b2e4f493e5b..9793322751e 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -11,9 +11,6 @@
* Check with readelf after changing.
*/
-/* Disable profiling for userspace code: */
-#define DISABLE_BRANCH_PROFILING
-
#include <uapi/linux/time.h>
#include <asm/vgtod.h>
#include <asm/hpet.h>
diff --git a/arch/x86/vdso/vdso-fakesections.c b/arch/x86/vdso/vdso-fakesections.c
deleted file mode 100644
index cb8a8d72c24..00000000000
--- a/arch/x86/vdso/vdso-fakesections.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright 2014 Andy Lutomirski
- * Subject to the GNU Public License, v.2
- *
- * Hack to keep broken Go programs working.
- *
- * The Go runtime had a couple of bugs: it would read the section table to try
- * to figure out how many dynamic symbols there were (it shouldn't have looked
- * at the section table at all) and, if there were no SHT_SYNDYM section table
- * entry, it would use an uninitialized value for the number of symbols. As a
- * workaround, we supply a minimal section table. vdso2c will adjust the
- * in-memory image so that "vdso_fake_sections" becomes the section table.
- *
- * The bug was introduced by:
- * https://code.google.com/p/go/source/detail?r=56ea40aac72b (2012-08-31)
- * and is being addressed in the Go runtime in this issue:
- * https://code.google.com/p/go/issues/detail?id=8197
- */
-
-#ifndef __x86_64__
-#error This hack is specific to the 64-bit vDSO
-#endif
-
-#include <linux/elf.h>
-
-extern const __visible struct elf64_shdr vdso_fake_sections[];
-const __visible struct elf64_shdr vdso_fake_sections[] = {
- {
- .sh_type = SHT_DYNSYM,
- .sh_entsize = sizeof(Elf64_Sym),
- }
-};
diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S
index 2ec72f651eb..de2c921025f 100644
--- a/arch/x86/vdso/vdso-layout.lds.S
+++ b/arch/x86/vdso/vdso-layout.lds.S
@@ -6,8 +6,37 @@
* This script controls its layout.
*/
+#if defined(BUILD_VDSO64)
+# define SHDR_SIZE 64
+#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32)
+# define SHDR_SIZE 40
+#else
+# error unknown VDSO target
+#endif
+
+#define NUM_FAKE_SHDRS 13
+
SECTIONS
{
+ /*
+ * User/kernel shared data is before the vDSO. This may be a little
+ * uglier than putting it after the vDSO, but it avoids issues with
+ * non-allocatable things that dangle past the end of the PT_LOAD
+ * segment.
+ */
+
+ vvar_start = . - 2 * PAGE_SIZE;
+ vvar_page = vvar_start;
+
+ /* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
+#define __VVAR_KERNEL_LDS
+#include <asm/vvar.h>
+#undef __VVAR_KERNEL_LDS
+#undef EMIT_VVAR
+
+ hpet_page = vvar_start + PAGE_SIZE;
+
. = SIZEOF_HEADERS;
.hash : { *(.hash) } :text
@@ -18,63 +47,56 @@ SECTIONS
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
+ .dynamic : { *(.dynamic) } :text :dynamic
+
+ .rodata : {
+ *(.rodata*)
+ *(.data*)
+ *(.sdata*)
+ *(.got.plt) *(.got)
+ *(.gnu.linkonce.d.*)
+ *(.bss*)
+ *(.dynbss*)
+ *(.gnu.linkonce.b.*)
+
+ /*
+ * Ideally this would live in a C file, but that won't
+ * work cleanly for x32 until we start building the x32
+ * C code using an x32 toolchain.
+ */
+ VDSO_FAKE_SECTION_TABLE_START = .;
+ . = . + NUM_FAKE_SHDRS * SHDR_SIZE;
+ VDSO_FAKE_SECTION_TABLE_END = .;
+ } :text
+
+ .fake_shstrtab : { *(.fake_shstrtab) } :text
+
+
.note : { *(.note.*) } :text :note
.eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
.eh_frame : { KEEP (*(.eh_frame)) } :text
- .dynamic : { *(.dynamic) } :text :dynamic
-
- .rodata : { *(.rodata*) } :text
- .data : {
- *(.data*)
- *(.sdata*)
- *(.got.plt) *(.got)
- *(.gnu.linkonce.d.*)
- *(.bss*)
- *(.dynbss*)
- *(.gnu.linkonce.b.*)
- }
-
- .altinstructions : { *(.altinstructions) }
- .altinstr_replacement : { *(.altinstr_replacement) }
/*
- * Align the actual code well away from the non-instruction data.
- * This is the best thing for the I-cache.
+ * Text is well-separated from actual data: there's plenty of
+ * stuff that isn't used at runtime in between.
*/
- . = ALIGN(0x100);
.text : { *(.text*) } :text =0x90909090,
/*
- * The remainder of the vDSO consists of special pages that are
- * shared between the kernel and userspace. It needs to be at the
- * end so that it doesn't overlap the mapping of the actual
- * vDSO image.
+ * At the end so that eu-elflint stays happy when vdso2c strips
+ * these. A better implementation would avoid allocating space
+ * for these.
*/
-
- . = ALIGN(PAGE_SIZE);
- vvar_page = .;
-
- /* Place all vvars at the offsets in asm/vvar.h. */
-#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
-#define __VVAR_KERNEL_LDS
-#include <asm/vvar.h>
-#undef __VVAR_KERNEL_LDS
-#undef EMIT_VVAR
-
- . = vvar_page + PAGE_SIZE;
-
- hpet_page = .;
- . = . + PAGE_SIZE;
-
- . = ALIGN(PAGE_SIZE);
- end_mapping = .;
+ .altinstructions : { *(.altinstructions) } :text
+ .altinstr_replacement : { *(.altinstr_replacement) } :text
/DISCARD/ : {
*(.discard)
*(.discard.*)
+ *(__bug_table)
}
}
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index 75e3404c83b..6807932643c 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -6,6 +6,8 @@
* the DSO.
*/
+#define BUILD_VDSO64
+
#include "vdso-layout.lds.S"
/*
diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c
index 7a6bf50f916..8627db24a7f 100644
--- a/arch/x86/vdso/vdso2c.c
+++ b/arch/x86/vdso/vdso2c.c
@@ -1,3 +1,53 @@
+/*
+ * vdso2c - A vdso image preparation tool
+ * Copyright (c) 2014 Andy Lutomirski and others
+ * Licensed under the GPL v2
+ *
+ * vdso2c requires stripped and unstripped input. It would be trivial
+ * to fully strip the input in here, but, for reasons described below,
+ * we need to write a section table. Doing this is more or less
+ * equivalent to dropping all non-allocatable sections, but it's
+ * easier to let objcopy handle that instead of doing it ourselves.
+ * If we ever need to do something fancier than what objcopy provides,
+ * it would be straightforward to add here.
+ *
+ * We're keep a section table for a few reasons:
+ *
+ * The Go runtime had a couple of bugs: it would read the section
+ * table to try to figure out how many dynamic symbols there were (it
+ * shouldn't have looked at the section table at all) and, if there
+ * were no SHT_SYNDYM section table entry, it would use an
+ * uninitialized value for the number of symbols. An empty DYNSYM
+ * table would work, but I see no reason not to write a valid one (and
+ * keep full performance for old Go programs). This hack is only
+ * needed on x86_64.
+ *
+ * The bug was introduced on 2012-08-31 by:
+ * https://code.google.com/p/go/source/detail?r=56ea40aac72b
+ * and was fixed on 2014-06-13 by:
+ * https://code.google.com/p/go/source/detail?r=fc1cd5e12595
+ *
+ * Binutils has issues debugging the vDSO: it reads the section table to
+ * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which
+ * would break build-id if we removed the section table. Binutils
+ * also requires that shstrndx != 0. See:
+ * https://sourceware.org/bugzilla/show_bug.cgi?id=17064
+ *
+ * elfutils might not look for PT_NOTE if there is a section table at
+ * all. I don't know whether this matters for any practical purpose.
+ *
+ * For simplicity, rather than hacking up a partial section table, we
+ * just write a mostly complete one. We omit non-dynamic symbols,
+ * though, since they're rather large.
+ *
+ * Once binutils gets fixed, we might be able to drop this for all but
+ * the 64-bit vdso, since build-id only works in kernel RPMs, and
+ * systems that update to new enough kernel RPMs will likely update
+ * binutils in sync. build-id has never worked for home-built kernel
+ * RPMs without manual symlinking, and I suspect that no one ever does
+ * that.
+ */
+
#include <inttypes.h>
#include <stdint.h>
#include <unistd.h>
@@ -20,9 +70,11 @@ const char *outfilename;
/* Symbols that we need in vdso2c. */
enum {
+ sym_vvar_start,
sym_vvar_page,
sym_hpet_page,
- sym_end_mapping,
+ sym_VDSO_FAKE_SECTION_TABLE_START,
+ sym_VDSO_FAKE_SECTION_TABLE_END,
};
const int special_pages[] = {
@@ -30,15 +82,26 @@ const int special_pages[] = {
sym_hpet_page,
};
-char const * const required_syms[] = {
- [sym_vvar_page] = "vvar_page",
- [sym_hpet_page] = "hpet_page",
- [sym_end_mapping] = "end_mapping",
- "VDSO32_NOTE_MASK",
- "VDSO32_SYSENTER_RETURN",
- "__kernel_vsyscall",
- "__kernel_sigreturn",
- "__kernel_rt_sigreturn",
+struct vdso_sym {
+ const char *name;
+ bool export;
+};
+
+struct vdso_sym required_syms[] = {
+ [sym_vvar_start] = {"vvar_start", true},
+ [sym_vvar_page] = {"vvar_page", true},
+ [sym_hpet_page] = {"hpet_page", true},
+ [sym_VDSO_FAKE_SECTION_TABLE_START] = {
+ "VDSO_FAKE_SECTION_TABLE_START", false
+ },
+ [sym_VDSO_FAKE_SECTION_TABLE_END] = {
+ "VDSO_FAKE_SECTION_TABLE_END", false
+ },
+ {"VDSO32_NOTE_MASK", true},
+ {"VDSO32_SYSENTER_RETURN", true},
+ {"__kernel_vsyscall", true},
+ {"__kernel_sigreturn", true},
+ {"__kernel_rt_sigreturn", true},
};
__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
@@ -48,7 +111,8 @@ static void fail(const char *format, ...)
va_start(ap, format);
fprintf(stderr, "Error: ");
vfprintf(stderr, format, ap);
- unlink(outfilename);
+ if (outfilename)
+ unlink(outfilename);
exit(1);
va_end(ap);
}
@@ -83,62 +147,71 @@ extern void bad_put_le(void);
#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0]))
-#define BITS 64
-#define GOFUNC go64
-#define Elf_Ehdr Elf64_Ehdr
-#define Elf_Shdr Elf64_Shdr
-#define Elf_Phdr Elf64_Phdr
-#define Elf_Sym Elf64_Sym
-#define Elf_Dyn Elf64_Dyn
+#define BITSFUNC3(name, bits, suffix) name##bits##suffix
+#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix)
+#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, )
+
+#define INT_BITS BITSFUNC2(int, ELF_BITS, _t)
+
+#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x
+#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x)
+#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x)
+
+#define ELF_BITS 64
#include "vdso2c.h"
-#undef BITS
-#undef GOFUNC
-#undef Elf_Ehdr
-#undef Elf_Shdr
-#undef Elf_Phdr
-#undef Elf_Sym
-#undef Elf_Dyn
-
-#define BITS 32
-#define GOFUNC go32
-#define Elf_Ehdr Elf32_Ehdr
-#define Elf_Shdr Elf32_Shdr
-#define Elf_Phdr Elf32_Phdr
-#define Elf_Sym Elf32_Sym
-#define Elf_Dyn Elf32_Dyn
+#undef ELF_BITS
+
+#define ELF_BITS 32
#include "vdso2c.h"
-#undef BITS
-#undef GOFUNC
-#undef Elf_Ehdr
-#undef Elf_Shdr
-#undef Elf_Phdr
-#undef Elf_Sym
-#undef Elf_Dyn
-
-static void go(void *addr, size_t len, FILE *outfile, const char *name)
+#undef ELF_BITS
+
+static void go(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *name)
{
- Elf64_Ehdr *hdr = (Elf64_Ehdr *)addr;
+ Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr;
if (hdr->e_ident[EI_CLASS] == ELFCLASS64) {
- go64(addr, len, outfile, name);
+ go64(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
} else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) {
- go32(addr, len, outfile, name);
+ go32(raw_addr, raw_len, stripped_addr, stripped_len,
+ outfile, name);
} else {
fail("unknown ELF class\n");
}
}
+static void map_input(const char *name, void **addr, size_t *len, int prot)
+{
+ off_t tmp_len;
+
+ int fd = open(name, O_RDONLY);
+ if (fd == -1)
+ err(1, "%s", name);
+
+ tmp_len = lseek(fd, 0, SEEK_END);
+ if (tmp_len == (off_t)-1)
+ err(1, "lseek");
+ *len = (size_t)tmp_len;
+
+ *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0);
+ if (*addr == MAP_FAILED)
+ err(1, "mmap");
+
+ close(fd);
+}
+
int main(int argc, char **argv)
{
- int fd;
- off_t len;
- void *addr;
+ size_t raw_len, stripped_len;
+ void *raw_addr, *stripped_addr;
FILE *outfile;
char *name, *tmp;
int namelen;
- if (argc != 3) {
- printf("Usage: vdso2c INPUT OUTPUT\n");
+ if (argc != 4) {
+ printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n");
return 1;
}
@@ -146,7 +219,7 @@ int main(int argc, char **argv)
* Figure out the struct name. If we're writing to a .so file,
* generate raw output insted.
*/
- name = strdup(argv[2]);
+ name = strdup(argv[3]);
namelen = strlen(name);
if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) {
name = NULL;
@@ -162,26 +235,18 @@ int main(int argc, char **argv)
*tmp = '_';
}
- fd = open(argv[1], O_RDONLY);
- if (fd == -1)
- err(1, "%s", argv[1]);
-
- len = lseek(fd, 0, SEEK_END);
- if (len == (off_t)-1)
- err(1, "lseek");
-
- addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
- if (addr == MAP_FAILED)
- err(1, "mmap");
+ map_input(argv[1], &raw_addr, &raw_len, PROT_READ);
+ map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ);
- outfilename = argv[2];
+ outfilename = argv[3];
outfile = fopen(outfilename, "w");
if (!outfile)
err(1, "%s", argv[2]);
- go(addr, (size_t)len, outfile, name);
+ go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name);
- munmap(addr, len);
+ munmap(raw_addr, raw_len);
+ munmap(stripped_addr, stripped_len);
fclose(outfile);
return 0;
diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h
index c6eefaf389b..fd57829b30d 100644
--- a/arch/x86/vdso/vdso2c.h
+++ b/arch/x86/vdso/vdso2c.h
@@ -4,23 +4,23 @@
* are built for 32-bit userspace.
*/
-static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
+static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
+ void *stripped_addr, size_t stripped_len,
+ FILE *outfile, const char *name)
{
int found_load = 0;
unsigned long load_size = -1; /* Work around bogus warning */
- unsigned long data_size;
- Elf_Ehdr *hdr = (Elf_Ehdr *)addr;
+ unsigned long mapping_size;
+ ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr;
int i;
unsigned long j;
- Elf_Shdr *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
+ ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr,
*alt_sec = NULL;
- Elf_Dyn *dyn = 0, *dyn_end = 0;
+ ELF(Dyn) *dyn = 0, *dyn_end = 0;
const char *secstrings;
- uint64_t syms[NSYMS] = {};
+ INT_BITS syms[NSYMS] = {};
- uint64_t fake_sections_value = 0, fake_sections_size = 0;
-
- Elf_Phdr *pt = (Elf_Phdr *)(addr + GET_LE(&hdr->e_phoff));
+ ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff));
/* Walk the segment table. */
for (i = 0; i < GET_LE(&hdr->e_phnum); i++) {
@@ -38,30 +38,32 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
load_size = GET_LE(&pt[i].p_memsz);
found_load = 1;
} else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) {
- dyn = addr + GET_LE(&pt[i].p_offset);
- dyn_end = addr + GET_LE(&pt[i].p_offset) +
+ dyn = raw_addr + GET_LE(&pt[i].p_offset);
+ dyn_end = raw_addr + GET_LE(&pt[i].p_offset) +
GET_LE(&pt[i].p_memsz);
}
}
if (!found_load)
fail("no PT_LOAD seg\n");
- data_size = (load_size + 4095) / 4096 * 4096;
+
+ if (stripped_len < load_size)
+ fail("stripped input is too short\n");
/* Walk the dynamic table */
for (i = 0; dyn + i < dyn_end &&
GET_LE(&dyn[i].d_tag) != DT_NULL; i++) {
typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag);
- if (tag == DT_REL || tag == DT_RELSZ ||
+ if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA ||
tag == DT_RELENT || tag == DT_TEXTREL)
fail("vdso image contains dynamic relocations\n");
}
/* Walk the section table */
- secstrings_hdr = addr + GET_LE(&hdr->e_shoff) +
+ secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx);
- secstrings = addr + GET_LE(&secstrings_hdr->sh_offset);
+ secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset);
for (i = 0; i < GET_LE(&hdr->e_shnum); i++) {
- Elf_Shdr *sh = addr + GET_LE(&hdr->e_shoff) +
+ ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * i;
if (GET_LE(&sh->sh_type) == SHT_SYMTAB)
symtab_hdr = sh;
@@ -74,7 +76,7 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
if (!symtab_hdr)
fail("no symbol table\n");
- strtab_hdr = addr + GET_LE(&hdr->e_shoff) +
+ strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) +
GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link);
/* Walk the symbol table */
@@ -82,27 +84,27 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize);
i++) {
int k;
- Elf_Sym *sym = addr + GET_LE(&symtab_hdr->sh_offset) +
+ ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) +
GET_LE(&symtab_hdr->sh_entsize) * i;
- const char *name = addr + GET_LE(&strtab_hdr->sh_offset) +
+ const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) +
GET_LE(&sym->st_name);
for (k = 0; k < NSYMS; k++) {
- if (!strcmp(name, required_syms[k])) {
+ if (!strcmp(name, required_syms[k].name)) {
if (syms[k]) {
fail("duplicate symbol %s\n",
- required_syms[k]);
+ required_syms[k].name);
}
+
+ /*
+ * Careful: we use negative addresses, but
+ * st_value is unsigned, so we rely
+ * on syms[k] being a signed type of the
+ * correct width.
+ */
syms[k] = GET_LE(&sym->st_value);
}
}
-
- if (!strcmp(name, "vdso_fake_sections")) {
- if (fake_sections_value)
- fail("duplicate vdso_fake_sections\n");
- fake_sections_value = GET_LE(&sym->st_value);
- fake_sections_size = GET_LE(&sym->st_size);
- }
}
/* Validate mapping addresses. */
@@ -112,30 +114,24 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
if (syms[i] % 4096)
fail("%s must be a multiple of 4096\n",
- required_syms[i]);
- if (syms[i] < data_size)
- fail("%s must be after the text mapping\n",
- required_syms[i]);
- if (syms[sym_end_mapping] < syms[i] + 4096)
- fail("%s overruns end_mapping\n", required_syms[i]);
+ required_syms[i].name);
+ if (syms[sym_vvar_start] > syms[i] + 4096)
+ fail("%s underruns begin_vvar\n",
+ required_syms[i].name);
+ if (syms[i] + 4096 > 0)
+ fail("%s is on the wrong side of the vdso text\n",
+ required_syms[i].name);
}
- if (syms[sym_end_mapping] % 4096)
- fail("end_mapping must be a multiple of 4096\n");
-
- /* Remove sections or use fakes */
- if (fake_sections_size % sizeof(Elf_Shdr))
- fail("vdso_fake_sections size is not a multiple of %ld\n",
- (long)sizeof(Elf_Shdr));
- PUT_LE(&hdr->e_shoff, fake_sections_value);
- PUT_LE(&hdr->e_shentsize, fake_sections_value ? sizeof(Elf_Shdr) : 0);
- PUT_LE(&hdr->e_shnum, fake_sections_size / sizeof(Elf_Shdr));
- PUT_LE(&hdr->e_shstrndx, SHN_UNDEF);
+ if (syms[sym_vvar_start] % 4096)
+ fail("vvar_begin must be a multiple of 4096\n");
if (!name) {
- fwrite(addr, load_size, 1, outfile);
+ fwrite(stripped_addr, stripped_len, 1, outfile);
return;
}
+ mapping_size = (stripped_len + 4095) / 4096 * 4096;
+
fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n");
fprintf(outfile, "#include <linux/linkage.h>\n");
fprintf(outfile, "#include <asm/page_types.h>\n");
@@ -143,20 +139,21 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
fprintf(outfile, "\n");
fprintf(outfile,
"static unsigned char raw_data[%lu] __page_aligned_data = {",
- data_size);
- for (j = 0; j < load_size; j++) {
+ mapping_size);
+ for (j = 0; j < stripped_len; j++) {
if (j % 10 == 0)
fprintf(outfile, "\n\t");
- fprintf(outfile, "0x%02X, ", (int)((unsigned char *)addr)[j]);
+ fprintf(outfile, "0x%02X, ",
+ (int)((unsigned char *)stripped_addr)[j]);
}
fprintf(outfile, "\n};\n\n");
fprintf(outfile, "static struct page *pages[%lu];\n\n",
- data_size / 4096);
+ mapping_size / 4096);
fprintf(outfile, "const struct vdso_image %s = {\n", name);
fprintf(outfile, "\t.data = raw_data,\n");
- fprintf(outfile, "\t.size = %lu,\n", data_size);
+ fprintf(outfile, "\t.size = %lu,\n", mapping_size);
fprintf(outfile, "\t.text_mapping = {\n");
fprintf(outfile, "\t\t.name = \"[vdso]\",\n");
fprintf(outfile, "\t\t.pages = pages,\n");
@@ -168,9 +165,9 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name)
(unsigned long)GET_LE(&alt_sec->sh_size));
}
for (i = 0; i < NSYMS; i++) {
- if (syms[i])
- fprintf(outfile, "\t.sym_%s = 0x%" PRIx64 ",\n",
- required_syms[i], syms[i]);
+ if (required_syms[i].export && syms[i])
+ fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n",
+ required_syms[i].name, (int64_t)syms[i]);
}
fprintf(outfile, "};\n");
}
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index e4f7781ee16..e904c270573 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -115,23 +115,6 @@ static __init int ia32_binfmt_init(void)
return 0;
}
__initcall(ia32_binfmt_init);
-#endif
-
-#else /* CONFIG_X86_32 */
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
- return NULL;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
- return 0;
-}
-
-int in_gate_area_no_mm(unsigned long addr)
-{
- return 0;
-}
+#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/vdso/vdso32/vdso-fakesections.c b/arch/x86/vdso/vdso32/vdso-fakesections.c
new file mode 100644
index 00000000000..541468e2526
--- /dev/null
+++ b/arch/x86/vdso/vdso32/vdso-fakesections.c
@@ -0,0 +1 @@
+#include "../vdso-fakesections.c"
diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S
index 46b991b578a..697c11ece90 100644
--- a/arch/x86/vdso/vdsox32.lds.S
+++ b/arch/x86/vdso/vdsox32.lds.S
@@ -6,6 +6,8 @@
* the DSO.
*/
+#define BUILD_VDSOX32
+
#include "vdso-layout.lds.S"
/*
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index e1513c47872..970463b566c 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -62,6 +62,9 @@ struct linux_binprm;
Only used for the 64-bit and x32 vdsos. */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
+#ifdef CONFIG_X86_32
+ return 0;
+#else
unsigned long addr, end;
unsigned offset;
end = (start + PMD_SIZE - 1) & PMD_MASK;
@@ -83,13 +86,14 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
addr = align_vdso_addr(addr);
return addr;
+#endif
}
static int map_vdso(const struct vdso_image *image, bool calculate_addr)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long addr;
+ unsigned long addr, text_start;
int ret = 0;
static struct page *no_pages[] = {NULL};
static struct vm_special_mapping vvar_mapping = {
@@ -99,26 +103,28 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
if (calculate_addr) {
addr = vdso_addr(current->mm->start_stack,
- image->sym_end_mapping);
+ image->size - image->sym_vvar_start);
} else {
addr = 0;
}
down_write(&mm->mmap_sem);
- addr = get_unmapped_area(NULL, addr, image->sym_end_mapping, 0, 0);
+ addr = get_unmapped_area(NULL, addr,
+ image->size - image->sym_vvar_start, 0, 0);
if (IS_ERR_VALUE(addr)) {
ret = addr;
goto up_fail;
}
- current->mm->context.vdso = (void __user *)addr;
+ text_start = addr - image->sym_vvar_start;
+ current->mm->context.vdso = (void __user *)text_start;
/*
* MAYWRITE to allow gdb to COW and set breakpoints
*/
vma = _install_special_mapping(mm,
- addr,
+ text_start,
image->size,
VM_READ|VM_EXEC|
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
@@ -130,9 +136,9 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
}
vma = _install_special_mapping(mm,
- addr + image->size,
- image->sym_end_mapping - image->size,
- VM_READ,
+ addr,
+ -image->sym_vvar_start,
+ VM_READ|VM_MAYREAD,
&vvar_mapping);
if (IS_ERR(vma)) {
@@ -142,7 +148,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
if (image->sym_vvar_page)
ret = remap_pfn_range(vma,
- addr + image->sym_vvar_page,
+ text_start + image->sym_vvar_page,
__pa_symbol(&__vvar_page) >> PAGE_SHIFT,
PAGE_SIZE,
PAGE_READONLY);
@@ -153,7 +159,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
#ifdef CONFIG_HPET_TIMER
if (hpet_address && image->sym_hpet_page) {
ret = io_remap_pfn_range(vma,
- addr + image->sym_hpet_page,
+ text_start + image->sym_hpet_page,
hpet_address >> PAGE_SHIFT,
PAGE_SIZE,
pgprot_noncached(PAGE_READONLY));
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c09cb6..7322755f337 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -22,3 +22,4 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
+obj-$(CONFIG_XEN_EFI) += efi.o
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
new file mode 100644
index 00000000000..a02e09e18f5
--- /dev/null
+++ b/arch/x86/xen/efi.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2014 Oracle Co., Daniel Kiper
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/string.h>
+
+#include <xen/xen-ops.h>
+
+#include <asm/setup.h>
+
+void __init xen_efi_init(void)
+{
+ efi_system_table_t *efi_systab_xen;
+
+ efi_systab_xen = xen_efi_probe();
+
+ if (efi_systab_xen == NULL)
+ return;
+
+ strncpy((char *)&boot_params.efi_info.efi_loader_signature, "Xen",
+ sizeof(boot_params.efi_info.efi_loader_signature));
+ boot_params.efi_info.efi_systab = (__u32)__pa(efi_systab_xen);
+ boot_params.efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32);
+
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_PARAVIRT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
+}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ffb101e4573..c0cb11fb500 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1718,6 +1718,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_setup_runstate_info(0);
+ xen_efi_init();
+
/* Start the world */
#ifdef CONFIG_X86_32
i386_start_kernel();
@@ -1826,8 +1828,19 @@ static void __init xen_hvm_guest_init(void)
xen_hvm_init_mmu_ops();
}
+static bool xen_nopv = false;
+static __init int xen_parse_nopv(char *arg)
+{
+ xen_nopv = true;
+ return 0;
+}
+early_param("xen_nopv", xen_parse_nopv);
+
static uint32_t __init xen_hvm_platform(void)
{
+ if (xen_nopv)
+ return 0;
+
if (xen_pv_domain())
return 0;
@@ -1836,6 +1849,8 @@ static uint32_t __init xen_hvm_platform(void)
bool xen_hvm_need_lapic(void)
{
+ if (xen_nopv)
+ return false;
if (xen_pv_domain())
return false;
if (!xen_hvm_domain())
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index c9858358858..c0413046483 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -36,99 +36,83 @@
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
+#include <xen/xen.h>
#include <asm/pgtable.h>
-static int map_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
+static struct gnttab_vm_area {
+ struct vm_struct *area;
+ pte_t **ptes;
+} gnttab_shared_vm_area;
+
+int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ void **__shared)
{
- unsigned long **frames = (unsigned long **)data;
+ void *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
- set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
- (*frames)++;
- return 0;
-}
+ if (shared == NULL)
+ *__shared = shared = gnttab_shared_vm_area.area->addr;
-/*
- * This function is used to map shared frames to store grant status. It is
- * different from map_pte_fn above, the frames type here is uint64_t.
- */
-static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
-{
- uint64_t **frames = (uint64_t **)data;
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
- set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
- (*frames)++;
return 0;
}
-static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
- unsigned long addr, void *data)
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
{
+ unsigned long addr;
+ unsigned long i;
- set_pte_at(&init_mm, addr, pte, __pte(0));
- return 0;
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
+ __pte(0));
+ addr += PAGE_SIZE;
+ }
}
-int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
- unsigned long max_nr_gframes,
- void **__shared)
+static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
{
- int rc;
- void *shared = *__shared;
+ area->ptes = kmalloc(sizeof(pte_t *) * nr_frames, GFP_KERNEL);
+ if (area->ptes == NULL)
+ return -ENOMEM;
- if (shared == NULL) {
- struct vm_struct *area =
- alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
- BUG_ON(area == NULL);
- shared = area->addr;
- *__shared = shared;
+ area->area = alloc_vm_area(PAGE_SIZE * nr_frames, area->ptes);
+ if (area->area == NULL) {
+ kfree(area->ptes);
+ return -ENOMEM;
}
- rc = apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes,
- map_pte_fn, &frames);
- return rc;
+ return 0;
}
-int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
- unsigned long max_nr_gframes,
- grant_status_t **__shared)
+int arch_gnttab_init(unsigned long nr_shared)
{
- int rc;
- grant_status_t *shared = *__shared;
-
- if (shared == NULL) {
- /* No need to pass in PTE as we are going to do it
- * in apply_to_page_range anyhow. */
- struct vm_struct *area =
- alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
- BUG_ON(area == NULL);
- shared = area->addr;
- *__shared = shared;
- }
+ if (!xen_pv_domain())
+ return 0;
- rc = apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes,
- map_pte_fn_status, &frames);
- return rc;
+ return arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
}
-void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
-{
- apply_to_page_range(&init_mm, (unsigned long)shared,
- PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
-}
#ifdef CONFIG_XEN_PVH
#include <xen/balloon.h>
#include <xen/events.h>
-#include <xen/xen.h>
#include <linux/slab.h>
static int __init xlated_setup_gnttab_pages(void)
{
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 9bb3d82ffec..3172692381a 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -841,10 +841,9 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
pfn = ALIGN(pfn, P2M_PER_PAGE);
}
- if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
+ WARN((pfn - pfn_s) != (pfn_e - pfn_s),
"Identity mapping failed. We are %ld short of 1-1 mappings!\n",
- (pfn_e - pfn_s) - (pfn - pfn_s)))
- printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
+ (pfn_e - pfn_s) - (pfn - pfn_s));
return pfn - pfn_s;
}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 97d87659f77..28c7e0be56e 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -105,6 +105,14 @@ static inline void __init xen_init_apic(void)
}
#endif
+#ifdef CONFIG_XEN_EFI
+extern void xen_efi_init(void);
+#else
+static inline void __init xen_efi_init(void)
+{
+}
+#endif
+
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \