aboutsummaryrefslogtreecommitdiff
path: root/arch/x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/Kconfig477
-rw-r--r--arch/x86_64/Kconfig.debug57
-rw-r--r--arch/x86_64/Makefile119
-rw-r--r--arch/x86_64/boot/Makefile102
-rw-r--r--arch/x86_64/boot/bootsect.S98
-rw-r--r--arch/x86_64/boot/compressed/Makefile32
-rw-r--r--arch/x86_64/boot/compressed/head.S142
-rw-r--r--arch/x86_64/boot/compressed/misc.c354
-rw-r--r--arch/x86_64/boot/compressed/miscsetup.h39
-rw-r--r--arch/x86_64/boot/compressed/vmlinux.scr9
-rw-r--r--arch/x86_64/boot/install.sh40
-rw-r--r--arch/x86_64/boot/mtools.conf.in17
-rw-r--r--arch/x86_64/boot/setup.S867
-rw-r--r--arch/x86_64/boot/tools/build.c186
-rw-r--r--arch/x86_64/boot/video.S2007
-rw-r--r--arch/x86_64/defconfig1129
-rw-r--r--arch/x86_64/ia32/Makefile32
-rw-r--r--arch/x86_64/ia32/fpu32.c184
-rw-r--r--arch/x86_64/ia32/ia32_aout.c529
-rw-r--r--arch/x86_64/ia32/ia32_binfmt.c434
-rw-r--r--arch/x86_64/ia32/ia32_ioctl.c201
-rw-r--r--arch/x86_64/ia32/ia32_signal.c621
-rw-r--r--arch/x86_64/ia32/ia32entry.S602
-rw-r--r--arch/x86_64/ia32/ipc32.c57
-rw-r--r--arch/x86_64/ia32/ptrace32.c379
-rw-r--r--arch/x86_64/ia32/sys_ia32.c1050
-rw-r--r--arch/x86_64/ia32/syscall32.c111
-rw-r--r--arch/x86_64/ia32/tls32.c163
-rw-r--r--arch/x86_64/ia32/vsyscall-sigreturn.S120
-rw-r--r--arch/x86_64/ia32/vsyscall-syscall.S68
-rw-r--r--arch/x86_64/ia32/vsyscall-sysenter.S94
-rw-r--r--arch/x86_64/ia32/vsyscall.lds77
-rw-r--r--arch/x86_64/kernel/Makefile45
-rw-r--r--arch/x86_64/kernel/acpi/Makefile3
-rw-r--r--arch/x86_64/kernel/acpi/sleep.c132
-rw-r--r--arch/x86_64/kernel/acpi/wakeup.S527
-rw-r--r--arch/x86_64/kernel/aperture.c286
-rw-r--r--arch/x86_64/kernel/apic.c1088
-rw-r--r--arch/x86_64/kernel/asm-offsets.c69
-rw-r--r--arch/x86_64/kernel/cpufreq/Kconfig96
-rw-r--r--arch/x86_64/kernel/cpufreq/Makefile17
-rw-r--r--arch/x86_64/kernel/e820.c513
-rw-r--r--arch/x86_64/kernel/early_printk.c220
-rw-r--r--arch/x86_64/kernel/entry.S920
-rw-r--r--arch/x86_64/kernel/genapic.c89
-rw-r--r--arch/x86_64/kernel/genapic_cluster.c130
-rw-r--r--arch/x86_64/kernel/genapic_flat.c127
-rw-r--r--arch/x86_64/kernel/head.S396
-rw-r--r--arch/x86_64/kernel/head64.c117
-rw-r--r--arch/x86_64/kernel/i387.c155
-rw-r--r--arch/x86_64/kernel/i8259.c579
-rw-r--r--arch/x86_64/kernel/init_task.c49
-rw-r--r--arch/x86_64/kernel/io_apic.c1982
-rw-r--r--arch/x86_64/kernel/ioport.c117
-rw-r--r--arch/x86_64/kernel/irq.c108
-rw-r--r--arch/x86_64/kernel/kprobes.c631
-rw-r--r--arch/x86_64/kernel/ldt.c253
-rw-r--r--arch/x86_64/kernel/mce.c548
-rw-r--r--arch/x86_64/kernel/mce_intel.c99
-rw-r--r--arch/x86_64/kernel/module.c166
-rw-r--r--arch/x86_64/kernel/mpparse.c949
-rw-r--r--arch/x86_64/kernel/msr.c279
-rw-r--r--arch/x86_64/kernel/nmi.c488
-rw-r--r--arch/x86_64/kernel/pci-dma.c60
-rw-r--r--arch/x86_64/kernel/pci-gart.c980
-rw-r--r--arch/x86_64/kernel/pci-nommu.c94
-rw-r--r--arch/x86_64/kernel/process.c770
-rw-r--r--arch/x86_64/kernel/ptrace.c547
-rw-r--r--arch/x86_64/kernel/reboot.c163
-rw-r--r--arch/x86_64/kernel/semaphore.c180
-rw-r--r--arch/x86_64/kernel/setup.c1189
-rw-r--r--arch/x86_64/kernel/setup64.c292
-rw-r--r--arch/x86_64/kernel/signal.c486
-rw-r--r--arch/x86_64/kernel/smp.c415
-rw-r--r--arch/x86_64/kernel/smpboot.c938
-rw-r--r--arch/x86_64/kernel/suspend.c157
-rw-r--r--arch/x86_64/kernel/suspend_asm.S104
-rw-r--r--arch/x86_64/kernel/sys_x86_64.c173
-rw-r--r--arch/x86_64/kernel/syscall.c26
-rw-r--r--arch/x86_64/kernel/time.c1262
-rw-r--r--arch/x86_64/kernel/trampoline.S64
-rw-r--r--arch/x86_64/kernel/traps.c948
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S164
-rw-r--r--arch/x86_64/kernel/vsyscall.c225
-rw-r--r--arch/x86_64/kernel/x8664_ksyms.c221
-rw-r--r--arch/x86_64/lib/Makefile14
-rw-r--r--arch/x86_64/lib/bitops.c141
-rw-r--r--arch/x86_64/lib/bitstr.c28
-rw-r--r--arch/x86_64/lib/clear_page.S50
-rw-r--r--arch/x86_64/lib/copy_page.S101
-rw-r--r--arch/x86_64/lib/copy_user.S294
-rw-r--r--arch/x86_64/lib/csum-copy.S233
-rw-r--r--arch/x86_64/lib/csum-partial.c150
-rw-r--r--arch/x86_64/lib/csum-wrappers.c129
-rw-r--r--arch/x86_64/lib/dec_and_lock.c40
-rw-r--r--arch/x86_64/lib/delay.c48
-rw-r--r--arch/x86_64/lib/getuser.S101
-rw-r--r--arch/x86_64/lib/io.c23
-rw-r--r--arch/x86_64/lib/memcpy.S121
-rw-r--r--arch/x86_64/lib/memmove.c19
-rw-r--r--arch/x86_64/lib/memset.S125
-rw-r--r--arch/x86_64/lib/putuser.S89
-rw-r--r--arch/x86_64/lib/thunk.S95
-rw-r--r--arch/x86_64/lib/usercopy.c153
-rw-r--r--arch/x86_64/mm/Makefile11
-rw-r--r--arch/x86_64/mm/extable.c35
-rw-r--r--arch/x86_64/mm/fault.c579
-rw-r--r--arch/x86_64/mm/init.c630
-rw-r--r--arch/x86_64/mm/ioremap.c283
-rw-r--r--arch/x86_64/mm/k8topology.c168
-rw-r--r--arch/x86_64/mm/numa.c294
-rw-r--r--arch/x86_64/mm/pageattr.c235
-rw-r--r--arch/x86_64/mm/srat.c217
-rw-r--r--arch/x86_64/oprofile/Kconfig23
-rw-r--r--arch/x86_64/oprofile/Makefile19
-rw-r--r--arch/x86_64/pci/Makefile24
-rw-r--r--arch/x86_64/pci/Makefile-BUS22
-rw-r--r--arch/x86_64/pci/k8-bus.c78
-rw-r--r--arch/x86_64/pci/mmconfig.c104
119 files changed, 35709 insertions, 0 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
new file mode 100644
index 000000000000..80c38c5d71fe
--- /dev/null
+++ b/arch/x86_64/Kconfig
@@ -0,0 +1,477 @@
+#
+# For a description of the syntax of this configuration file,
+# see Documentation/kbuild/kconfig-language.txt.
+#
+# Note: ISA is disabled and will hopefully never be enabled.
+# If you managed to buy an ISA x86-64 box you'll have to fix all the
+# ISA drivers you need yourself.
+#
+
+mainmenu "Linux Kernel Configuration"
+
+config X86_64
+ bool
+ default y
+ help
+ Port to the x86-64 architecture. x86-64 is a 64-bit extension to the
+ classical 32-bit x86 architecture. For details see
+ <http://www.x86-64.org/>.
+
+config 64BIT
+ def_bool y
+
+config X86
+ bool
+ default y
+
+config MMU
+ bool
+ default y
+
+config ISA
+ bool
+
+config SBUS
+ bool
+
+config RWSEM_GENERIC_SPINLOCK
+ bool
+ default y
+
+config RWSEM_XCHGADD_ALGORITHM
+ bool
+
+config GENERIC_CALIBRATE_DELAY
+ bool
+ default y
+
+config X86_CMPXCHG
+ bool
+ default y
+
+config EARLY_PRINTK
+ bool
+ default y
+
+config GENERIC_ISA_DMA
+ bool
+ default y
+
+config GENERIC_IOMAP
+ bool
+ default y
+
+source "init/Kconfig"
+
+
+menu "Processor type and features"
+
+choice
+ prompt "Processor family"
+ default MK8
+
+config MK8
+ bool "AMD-Opteron/Athlon64"
+ help
+ Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs.
+
+config MPSC
+ bool "Intel EM64T"
+ help
+ Optimize for Intel Pentium 4 and Xeon CPUs with Intel
+ Extended Memory 64 Technology(EM64T). For details see
+ <http://www.intel.com/technology/64bitextensions/>.
+
+config GENERIC_CPU
+ bool "Generic-x86-64"
+ help
+ Generic x86-64 CPU.
+
+endchoice
+
+#
+# Define implied options from the CPU selection here
+#
+config X86_L1_CACHE_BYTES
+ int
+ default "128" if GENERIC_CPU || MPSC
+ default "64" if MK8
+
+config X86_L1_CACHE_SHIFT
+ int
+ default "7" if GENERIC_CPU || MPSC
+ default "6" if MK8
+
+config X86_TSC
+ bool
+ default y
+
+config X86_GOOD_APIC
+ bool
+ default y
+
+config MICROCODE
+ tristate "/dev/cpu/microcode - Intel CPU microcode support"
+ ---help---
+ If you say Y here the 'File systems' section, you will be
+ able to update the microcode on Intel processors. You will
+ obviously need the actual microcode binary data itself which is
+ not shipped with the Linux kernel.
+
+ For latest news and information on obtaining all the required
+ ingredients for this driver, check:
+ <http://www.urbanmyth.org/microcode/>.
+
+ To compile this driver as a module, choose M here: the
+ module will be called microcode.
+ If you use modprobe or kmod you may also want to add the line
+ 'alias char-major-10-184 microcode' to your /etc/modules.conf file.
+
+config X86_MSR
+ tristate "/dev/cpu/*/msr - Model-specific register support"
+ help
+ This device gives privileged processes access to the x86
+ Model-Specific Registers (MSRs). It is a character device with
+ major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
+ MSR accesses are directed to a specific CPU on multi-processor
+ systems.
+
+config X86_CPUID
+ tristate "/dev/cpu/*/cpuid - CPU information support"
+ help
+ This device gives processes access to the x86 CPUID instruction to
+ be executed on a specific processor. It is a character device
+ with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
+ /dev/cpu/31/cpuid.
+
+# disable it for opteron optimized builds because it pulls in ACPI_BOOT
+config X86_HT
+ bool
+ depends on SMP && !MK8
+ default y
+
+config MATH_EMULATION
+ bool
+
+config MCA
+ bool
+
+config EISA
+ bool
+
+config X86_IO_APIC
+ bool
+ default y
+
+config X86_LOCAL_APIC
+ bool
+ default y
+
+config MTRR
+ bool "MTRR (Memory Type Range Register) support"
+ ---help---
+ On Intel P6 family processors (Pentium Pro, Pentium II and later)
+ the Memory Type Range Registers (MTRRs) may be used to control
+ processor access to memory ranges. This is most useful if you have
+ a video (VGA) card on a PCI or AGP bus. Enabling write-combining
+ allows bus write transfers to be combined into a larger transfer
+ before bursting over the PCI/AGP bus. This can increase performance
+ of image write operations 2.5 times or more. Saying Y here creates a
+ /proc/mtrr file which may be used to manipulate your processor's
+ MTRRs. Typically the X server should use this.
+
+ This code has a reasonably generic interface so that similar
+ control registers on other processors can be easily supported
+ as well.
+
+ Saying Y here also fixes a problem with buggy SMP BIOSes which only
+ set the MTRRs for the boot CPU and not for the secondary CPUs. This
+ can lead to all sorts of problems, so it's good to say Y here.
+
+ Just say Y here, all x86-64 machines support MTRRs.
+
+ See <file:Documentation/mtrr.txt> for more information.
+
+config SMP
+ bool "Symmetric multi-processing support"
+ ---help---
+ This enables support for systems with more than one CPU. If you have
+ a system with only one CPU, like most personal computers, say N. If
+ you have a system with more than one CPU, say Y.
+
+ If you say N here, the kernel will run on single and multiprocessor
+ machines, but will use only one CPU of a multiprocessor machine. If
+ you say Y here, the kernel will run on many, but not all,
+ singleprocessor machines. On a singleprocessor machine, the kernel
+ will run faster if you say N here.
+
+ If you don't know what to do here, say N.
+
+config PREEMPT
+ bool "Preemptible Kernel"
+ ---help---
+ This option reduces the latency of the kernel when reacting to
+ real-time or interactive events by allowing a low priority process to
+ be preempted even if it is in kernel mode executing a system call.
+ This allows applications to run more reliably even when the system is
+ under load. On contrary it may also break your drivers and add
+ priority inheritance problems to your system. Don't select it if
+ you rely on a stable system or have slightly obscure hardware.
+ It's also not very well tested on x86-64 currently.
+ You have been warned.
+
+ Say Y here if you are feeling brave and building a kernel for a
+ desktop, embedded or real-time system. Say N if you are unsure.
+
+config PREEMPT_BKL
+ bool "Preempt The Big Kernel Lock"
+ depends on PREEMPT
+ default y
+ help
+ This option reduces the latency of the kernel by making the
+ big kernel lock preemptible.
+
+ Say Y here if you are building a kernel for a desktop system.
+ Say N if you are unsure.
+
+config SCHED_SMT
+ bool "SMT (Hyperthreading) scheduler support"
+ depends on SMP
+ default n
+ help
+ SMT scheduler support improves the CPU scheduler's decision making
+ when dealing with Intel Pentium 4 chips with HyperThreading at a
+ cost of slightly increased overhead in some places. If unsure say
+ N here.
+
+config K8_NUMA
+ bool "K8 NUMA support"
+ select NUMA
+ depends on SMP
+ help
+ Enable NUMA (Non Unified Memory Architecture) support for
+ AMD Opteron Multiprocessor systems. The kernel will try to allocate
+ memory used by a CPU on the local memory controller of the CPU
+ and add some more NUMA awareness to the kernel.
+ This code is recommended on all multiprocessor Opteron systems
+ and normally doesn't hurt on others.
+
+config NUMA_EMU
+ bool "NUMA emulation support"
+ select NUMA
+ depends on SMP
+ help
+ Enable NUMA emulation. A flat machine will be split
+ into virtual nodes when booted with "numa=fake=N", where N is the
+ number of nodes. This is only useful for debugging.
+
+config DISCONTIGMEM
+ bool
+ depends on NUMA
+ default y
+
+config NUMA
+ bool
+ default n
+
+config HAVE_DEC_LOCK
+ bool
+ depends on SMP
+ default y
+
+config NR_CPUS
+ int "Maximum number of CPUs (2-256)"
+ range 2 256
+ depends on SMP
+ default "8"
+ help
+ This allows you to specify the maximum number of CPUs which this
+ kernel will support. Current maximum is 256 CPUs due to
+ APIC addressing limits. Less depending on the hardware.
+
+ This is purely to save memory - each supported CPU requires
+ memory in the static kernel configuration.
+
+config HPET_TIMER
+ bool
+ default y
+ help
+ Use the IA-PC HPET (High Precision Event Timer) to manage
+ time in preference to the PIT and RTC, if a HPET is
+ present. The HPET provides a stable time base on SMP
+ systems, unlike the TSC, but it is more expensive to access,
+ as it is off-chip. You can find the HPET spec at
+ <http://www.intel.com/labs/platcomp/hpet/hpetspec.htm>.
+
+config HPET_EMULATE_RTC
+ bool "Provide RTC interrupt"
+ depends on HPET_TIMER && RTC=y
+
+config GART_IOMMU
+ bool "IOMMU support"
+ depends on PCI
+ help
+ Support the K8 IOMMU. Needed to run systems with more than 4GB of memory
+ properly with 32-bit PCI devices that do not support DAC (Double Address
+ Cycle). The IOMMU can be turned off at runtime with the iommu=off parameter.
+ Normally the kernel will take the right choice by itself.
+ If unsure, say Y.
+
+# need this always enabled with GART_IOMMU for the VIA workaround
+config SWIOTLB
+ bool
+ depends on GART_IOMMU
+ default y
+
+config DUMMY_IOMMU
+ bool
+ depends on !GART_IOMMU && !SWIOTLB
+ default y
+ help
+ Don't use IOMMU code. This will cause problems when you have more than 4GB
+ of memory and any 32-bit devices. Don't turn on unless you know what you
+ are doing.
+
+config X86_MCE
+ bool "Machine check support" if EMBEDDED
+ default y
+ help
+ Include a machine check error handler to report hardware errors.
+ This version will require the mcelog utility to decode some
+ machine check error logs. See
+ ftp://ftp.x86-64.org/pub/linux/tools/mcelog
+
+config X86_MCE_INTEL
+ bool "Intel MCE features"
+ depends on X86_MCE && X86_LOCAL_APIC
+ default y
+ help
+ Additional support for intel specific MCE features such as
+ the thermal monitor.
+
+config SECCOMP
+ bool "Enable seccomp to safely compute untrusted bytecode"
+ depends on PROC_FS
+ default y
+ help
+ This kernel feature is useful for number crunching applications
+ that may need to compute untrusted bytecode during their
+ execution. By using pipes or other transports made available to
+ the process as file descriptors supporting the read/write
+ syscalls, it's possible to isolate those applications in
+ their own address space using seccomp. Once seccomp is
+ enabled via /proc/<pid>/seccomp, it cannot be disabled
+ and the task is only allowed to execute a few safe syscalls
+ defined by each seccomp mode.
+
+ If unsure, say Y. Only embedded should say N here.
+
+endmenu
+
+#
+# Use the generic interrupt handling code in kernel/irq/:
+#
+config GENERIC_HARDIRQS
+ bool
+ default y
+
+config GENERIC_IRQ_PROBE
+ bool
+ default y
+
+menu "Power management options"
+
+source kernel/power/Kconfig
+
+source "drivers/acpi/Kconfig"
+
+source "arch/x86_64/kernel/cpufreq/Kconfig"
+
+endmenu
+
+menu "Bus options (PCI etc.)"
+
+config PCI
+ bool "PCI support"
+
+# x86-64 doesn't support PCI BIOS access from long mode so always go direct.
+config PCI_DIRECT
+ bool
+ depends on PCI
+ default y
+
+config PCI_MMCONFIG
+ bool "Support mmconfig PCI config space access"
+ depends on PCI
+ select ACPI_BOOT
+
+config UNORDERED_IO
+ bool "Unordered IO mapping access"
+ depends on EXPERIMENTAL
+ help
+ Use unordered stores to access IO memory mappings in device drivers.
+ Still very experimental. When a driver works on IA64/ppc64/pa-risc it should
+ work with this option, but it makes the drivers behave differently
+ from i386. Requires that the driver writer used memory barriers
+ properly.
+
+source "drivers/pci/pcie/Kconfig"
+
+source "drivers/pci/Kconfig"
+
+source "drivers/pcmcia/Kconfig"
+
+source "drivers/pci/hotplug/Kconfig"
+
+endmenu
+
+
+menu "Executable file formats / Emulations"
+
+source "fs/Kconfig.binfmt"
+
+config IA32_EMULATION
+ bool "IA32 Emulation"
+ help
+ Include code to run 32-bit programs under a 64-bit kernel. You should likely
+ turn this on, unless you're 100% sure that you don't have any 32-bit programs
+ left.
+
+config IA32_AOUT
+ bool "IA32 a.out support"
+ depends on IA32_EMULATION
+ help
+ Support old a.out binaries in the 32bit emulation.
+
+config COMPAT
+ bool
+ depends on IA32_EMULATION
+ default y
+
+config SYSVIPC_COMPAT
+ bool
+ depends on COMPAT && SYSVIPC
+ default y
+
+config UID16
+ bool
+ depends on IA32_EMULATION
+ default y
+
+endmenu
+
+source drivers/Kconfig
+
+source "drivers/firmware/Kconfig"
+
+source fs/Kconfig
+
+source "arch/x86_64/oprofile/Kconfig"
+
+source "arch/x86_64/Kconfig.debug"
+
+source "security/Kconfig"
+
+source "crypto/Kconfig"
+
+source "lib/Kconfig"
diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
new file mode 100644
index 000000000000..9cf1410d2f5a
--- /dev/null
+++ b/arch/x86_64/Kconfig.debug
@@ -0,0 +1,57 @@
+menu "Kernel hacking"
+
+source "lib/Kconfig.debug"
+
+# !SMP for now because the context switch early causes GPF in segment reloading
+# and the GS base checking does the wrong thing then, causing a hang.
+config CHECKING
+ bool "Additional run-time checks"
+ depends on DEBUG_KERNEL && !SMP
+ help
+ Enables some internal consistency checks for kernel debugging.
+ You should normally say N.
+
+config INIT_DEBUG
+ bool "Debug __init statements"
+ depends on DEBUG_KERNEL
+ help
+ Fill __init and __initdata at the end of boot. This helps debugging
+ illegal uses of __init and __initdata after initialization.
+
+config IOMMU_DEBUG
+ depends on GART_IOMMU && DEBUG_KERNEL
+ bool "Enable IOMMU debugging"
+ help
+ Force the IOMMU to on even when you have less than 4GB of
+ memory and add debugging code. On overflow always panic. And
+ allow to enable IOMMU leak tracing. Can be disabled at boot
+ time with iommu=noforce. This will also enable scatter gather
+ list merging. Currently not recommended for production
+ code. When you use it make sure you have a big enough
+ IOMMU/AGP aperture. Most of the options enabled by this can
+ be set more finegrained using the iommu= command line
+ options. See Documentation/x86_64/boot-options.txt for more
+ details.
+
+config KPROBES
+ bool "Kprobes"
+ depends on DEBUG_KERNEL
+ help
+ Kprobes allows you to trap at almost any kernel address and
+ execute a callback function. register_kprobe() establishes
+ a probepoint and specifies the callback. Kprobes is useful
+ for kernel debugging, non-intrusive instrumentation and testing.
+ If in doubt, say "N".
+
+config IOMMU_LEAK
+ bool "IOMMU leak tracing"
+ depends on DEBUG_KERNEL
+ depends on IOMMU_DEBUG
+ help
+ Add a simple leak tracer to the IOMMU code. This is useful when you
+ are debugging a buggy device driver that leaks IOMMU mappings.
+
+#config X86_REMOTE_DEBUG
+# bool "kgdb debugging stub"
+
+endmenu
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
new file mode 100644
index 000000000000..6f90c246c418
--- /dev/null
+++ b/arch/x86_64/Makefile
@@ -0,0 +1,119 @@
+#
+# x86_64/Makefile
+#
+# This file is included by the global makefile so that you can add your own
+# architecture-specific flags and dependencies. Remember to do have actions
+# for "archclean" and "archdep" for cleaning up and making dependencies for
+# this architecture
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 1994 by Linus Torvalds
+#
+# 19990713 Artur Skawina <skawina@geocities.com>
+# Added '-march' and '-mpreferred-stack-boundary' support
+# 20000913 Pavel Machek <pavel@suse.cz>
+# Converted for x86_64 architecture
+# 20010105 Andi Kleen, add IA32 compiler.
+# ....and later removed it again....
+#
+# $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $
+
+#
+# early bootup linking needs 32bit. You can either use real 32bit tools
+# here or 64bit tools in 32bit mode.
+#
+IA32_CC := $(CC) $(CPPFLAGS) -m32 -O2 -fomit-frame-pointer
+IA32_LD := $(LD) -m elf_i386
+IA32_AS := $(CC) $(AFLAGS) -m32 -Wa,--32 -traditional -c
+IA32_OBJCOPY := $(CROSS_COMPILE)objcopy
+IA32_CPP := $(CROSS_COMPILE)gcc -m32 -E
+export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP
+
+
+LDFLAGS := -m elf_x86_64
+OBJCOPYFLAGS := -O binary -R .note -R .comment -S
+LDFLAGS_vmlinux := -e stext
+
+CHECKFLAGS += -D__x86_64__ -m64
+
+cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
+cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
+CFLAGS += $(cflags-y)
+
+CFLAGS += -mno-red-zone
+CFLAGS += -mcmodel=kernel
+CFLAGS += -pipe
+# this makes reading assembly source easier, but produces worse code
+# actually it makes the kernel smaller too.
+CFLAGS += -fno-reorder-blocks
+CFLAGS += -Wno-sign-compare
+ifneq ($(CONFIG_DEBUG_INFO),y)
+CFLAGS += -fno-asynchronous-unwind-tables
+# -fweb shrinks the kernel a bit, but the difference is very small
+# it also messes up debugging, so don't use it for now.
+#CFLAGS += $(call cc-option,-fweb)
+endif
+# -funit-at-a-time shrinks the kernel .text considerably
+# unfortunately it makes reading oopses harder.
+CFLAGS += $(call cc-option,-funit-at-a-time)
+# prevent gcc from generating any FP code by mistake
+CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
+
+head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o
+
+libs-y += arch/x86_64/lib/
+core-y += arch/x86_64/kernel/ arch/x86_64/mm/
+core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/
+drivers-$(CONFIG_PCI) += arch/x86_64/pci/
+drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/
+
+boot := arch/x86_64/boot
+
+.PHONY: bzImage bzlilo install archmrproper \
+ fdimage fdimage144 fdimage288 archclean
+
+#Default target when executing "make"
+all: bzImage
+
+BOOTIMAGE := arch/x86_64/boot/bzImage
+KBUILD_IMAGE := $(BOOTIMAGE)
+
+bzImage: vmlinux
+ $(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE)
+
+bzlilo: vmlinux
+ $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zlilo
+
+bzdisk: vmlinux
+ $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zdisk
+
+install fdimage fdimage144 fdimage288: vmlinux
+ $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@
+
+archclean:
+ $(Q)$(MAKE) $(clean)=$(boot)
+
+prepare: include/asm-$(ARCH)/offset.h
+
+arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \
+ include/config/MARKER
+
+include/asm-$(ARCH)/offset.h: arch/$(ARCH)/kernel/asm-offsets.s
+ $(call filechk,gen-asm-offsets)
+
+CLEAN_FILES += include/asm-$(ARCH)/offset.h
+
+define archhelp
+ echo '* bzImage - Compressed kernel image (arch/$(ARCH)/boot/bzImage)'
+ echo ' install - Install kernel using'
+ echo ' (your) ~/bin/installkernel or'
+ echo ' (distribution) /sbin/installkernel or'
+ echo ' install to $$(INSTALL_PATH) and run lilo'
+endef
+
+CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf
+
+
diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile
new file mode 100644
index 000000000000..f4399c701b77
--- /dev/null
+++ b/arch/x86_64/boot/Makefile
@@ -0,0 +1,102 @@
+#
+# arch/x86_64/boot/Makefile
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 1994 by Linus Torvalds
+#
+
+# ROOT_DEV specifies the default root-device when making the image.
+# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
+# the default of FLOPPY is used by 'build'.
+
+ROOT_DEV := CURRENT
+
+# If you want to preset the SVGA mode, uncomment the next line and
+# set SVGA_MODE to whatever number you want.
+# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
+# The number is the same as you would ordinarily press at bootup.
+
+SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
+
+# If you want the RAM disk device, define this to be the size in blocks.
+
+#RAMDISK := -DRAMDISK=512
+
+targets := vmlinux.bin bootsect bootsect.o \
+ setup setup.o bzImage mtools.conf
+
+EXTRA_CFLAGS := -m32
+
+hostprogs-y := tools/build
+HOST_EXTRACFLAGS += $(LINUXINCLUDE)
+subdir- := compressed/ #Let make clean descend in compressed/
+# ---------------------------------------------------------------------------
+
+$(obj)/bzImage: IMAGE_OFFSET := 0x100000
+$(obj)/bzImage: EXTRA_AFLAGS := -traditional $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
+$(obj)/bzImage: BUILDFLAGS := -b
+
+quiet_cmd_image = BUILD $@
+cmd_image = $(obj)/tools/build $(BUILDFLAGS) $(obj)/bootsect $(obj)/setup \
+ $(obj)/vmlinux.bin $(ROOT_DEV) > $@
+
+$(obj)/bzImage: $(obj)/bootsect $(obj)/setup \
+ $(obj)/vmlinux.bin $(obj)/tools/build FORCE
+ $(call if_changed,image)
+ @echo 'Kernel: $@ is ready'
+
+$(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
+ $(call if_changed,objcopy)
+
+LDFLAGS_bootsect := -Ttext 0x0 -s --oformat binary
+LDFLAGS_setup := -Ttext 0x0 -s --oformat binary -e begtext
+
+$(obj)/setup $(obj)/bootsect: %: %.o FORCE
+ $(call if_changed,ld)
+
+$(obj)/compressed/vmlinux: FORCE
+ $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@
+
+# Set this if you want to pass append arguments to the zdisk/fdimage kernel
+FDARGS =
+
+$(obj)/mtools.conf: $(src)/mtools.conf.in
+ sed -e 's|@OBJ@|$(obj)|g' < $< > $@
+
+# This requires write access to /dev/fd0
+zdisk: $(BOOTIMAGE) $(obj)/mtools.conf
+ MTOOLSRC=$(obj)/mtools.conf mformat a: ; sync
+ syslinux /dev/fd0 ; sync
+ echo 'default linux $(FDARGS)' | \
+ MTOOLSRC=$(obj)/mtools.conf mcopy - a:syslinux.cfg
+ MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) a:linux ; sync
+
+# These require being root or having syslinux 2.02 or higher installed
+fdimage fdimage144: $(BOOTIMAGE) $(obj)/mtools.conf
+ dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
+ MTOOLSRC=$(obj)/mtools.conf mformat v: ; sync
+ syslinux $(obj)/fdimage ; sync
+ echo 'default linux $(FDARGS)' | \
+ MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg
+ MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) v:linux ; sync
+
+fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
+ dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
+ MTOOLSRC=$(obj)/mtools.conf mformat w: ; sync
+ syslinux $(obj)/fdimage ; sync
+ echo 'default linux $(FDARGS)' | \
+ MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg
+ MTOOLSRC=$(obj)/mtools.conf mcopy $(BOOTIMAGE) w:linux ; sync
+
+zlilo: $(BOOTIMAGE)
+ if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
+ if [ -f $(INSTALL_PATH)/System.map ]; then mv $(INSTALL_PATH)/System.map $(INSTALL_PATH)/System.old; fi
+ cat $(BOOTIMAGE) > $(INSTALL_PATH)/vmlinuz
+ cp System.map $(INSTALL_PATH)/
+ if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
+
+install: $(BOOTIMAGE)
+ sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(BOOTIMAGE) System.map "$(INSTALL_PATH)"
diff --git a/arch/x86_64/boot/bootsect.S b/arch/x86_64/boot/bootsect.S
new file mode 100644
index 000000000000..bb15d406ee95
--- /dev/null
+++ b/arch/x86_64/boot/bootsect.S
@@ -0,0 +1,98 @@
+/*
+ * bootsect.S Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * modified by Drew Eckhardt
+ * modified by Bruce Evans (bde)
+ * modified by Chris Noe (May 1999) (as86 -> gas)
+ * gutted by H. Peter Anvin (Jan 2003)
+ *
+ * BIG FAT NOTE: We're in real mode using 64k segments. Therefore segment
+ * addresses must be multiplied by 16 to obtain their respective linear
+ * addresses. To avoid confusion, linear addresses are written using leading
+ * hex while segment addresses are written as segment:offset.
+ *
+ */
+
+#include <asm/boot.h>
+
+SETUPSECTS = 4 /* default nr of setup-sectors */
+BOOTSEG = 0x07C0 /* original address of boot-sector */
+INITSEG = DEF_INITSEG /* we move boot here - out of the way */
+SETUPSEG = DEF_SETUPSEG /* setup starts here */
+SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
+SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
+ /* to be loaded */
+ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
+SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
+
+#ifndef SVGA_MODE
+#define SVGA_MODE ASK_VGA
+#endif
+
+#ifndef RAMDISK
+#define RAMDISK 0
+#endif
+
+#ifndef ROOT_RDONLY
+#define ROOT_RDONLY 1
+#endif
+
+.code16
+.text
+
+.global _start
+_start:
+
+ # Normalize the start address
+ jmpl $BOOTSEG, $start2
+
+start2:
+ movw %cs, %ax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %ss
+ movw $0x7c00, %sp
+ sti
+ cld
+
+ movw $bugger_off_msg, %si
+
+msg_loop:
+ lodsb
+ andb %al, %al
+ jz die
+ movb $0xe, %ah
+ movw $7, %bx
+ int $0x10
+ jmp msg_loop
+
+die:
+ # Allow the user to press a key, then reboot
+ xorw %ax, %ax
+ int $0x16
+ int $0x19
+
+ # int 0x19 should never return. In case it does anyway,
+ # invoke the BIOS reset code...
+ ljmp $0xf000,$0xfff0
+
+
+bugger_off_msg:
+ .ascii "Direct booting from floppy is no longer supported.\r\n"
+ .ascii "Please use a boot loader program instead.\r\n"
+ .ascii "\n"
+ .ascii "Remove disk and press any key to reboot . . .\r\n"
+ .byte 0
+
+
+ # Kernel attributes; used by setup
+
+ .org 497
+setup_sects: .byte SETUPSECTS
+root_flags: .word ROOT_RDONLY
+syssize: .word SYSSIZE
+swap_dev: .word SWAP_DEV
+ram_size: .word RAMDISK
+vid_mode: .word SVGA_MODE
+root_dev: .word ROOT_DEV
+boot_flag: .word 0xAA55
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile
new file mode 100644
index 000000000000..f89d96f11a9f
--- /dev/null
+++ b/arch/x86_64/boot/compressed/Makefile
@@ -0,0 +1,32 @@
+#
+# linux/arch/x86_64/boot/compressed/Makefile
+#
+# create a compressed vmlinux image from the original vmlinux
+#
+# Note all the files here are compiled/linked as 32bit executables.
+#
+
+targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o
+EXTRA_AFLAGS := -traditional -m32
+
+# cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with
+# -m32
+CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing
+LDFLAGS := -m elf_i386
+
+LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386
+
+$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
+ $(call if_changed,ld)
+ @:
+
+$(obj)/vmlinux.bin: vmlinux FORCE
+ $(call if_changed,objcopy)
+
+$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+ $(call if_changed,gzip)
+
+LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
+
+$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
+ $(call if_changed,ld)
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S
new file mode 100644
index 000000000000..27264dbd575c
--- /dev/null
+++ b/arch/x86_64/boot/compressed/head.S
@@ -0,0 +1,142 @@
+/*
+ * linux/boot/head.S
+ *
+ * Copyright (C) 1991, 1992, 1993 Linus Torvalds
+ *
+ * $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $
+ */
+
+/*
+ * head.S contains the 32-bit startup code.
+ *
+ * NOTE!!! Startup happens at absolute address 0x00001000, which is also where
+ * the page directory will exist. The startup code will be overwritten by
+ * the page directory. [According to comments etc elsewhere on a compressed
+ * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
+ *
+ * Page 0 is deliberately kept safe, since System Management Mode code in
+ * laptops may need to access the BIOS data stored there. This is also
+ * useful for future device drivers that either access the BIOS via VM86
+ * mode.
+ */
+
+/*
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ */
+.code32
+.text
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+
+ .code32
+ .globl startup_32
+
+startup_32:
+ cld
+ cli
+ movl $(__KERNEL_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ movl %eax,%fs
+ movl %eax,%gs
+
+ lss stack_start,%esp
+ xorl %eax,%eax
+1: incl %eax # check that A20 really IS enabled
+ movl %eax,0x000000 # loop forever if it isn't
+ cmpl %eax,0x100000
+ je 1b
+
+/*
+ * Initialize eflags. Some BIOS's leave bits like NT set. This would
+ * confuse the debugger if this code is traced.
+ * XXX - best to initialize before switching to protected mode.
+ */
+ pushl $0
+ popfl
+/*
+ * Clear BSS
+ */
+ xorl %eax,%eax
+ movl $_edata,%edi
+ movl $_end,%ecx
+ subl %edi,%ecx
+ cld
+ rep
+ stosb
+/*
+ * Do the decompression, and jump to the new kernel..
+ */
+ subl $16,%esp # place for structure on the stack
+ movl %esp,%eax
+ pushl %esi # real mode pointer as second arg
+ pushl %eax # address of structure as first arg
+ call decompress_kernel
+ orl %eax,%eax
+ jnz 3f
+ addl $8,%esp
+ xorl %ebx,%ebx
+ ljmp $(__KERNEL_CS), $0x100000
+
+/*
+ * We come here, if we were loaded high.
+ * We need to move the move-in-place routine down to 0x1000
+ * and then start it with the buffer addresses in registers,
+ * which we got from the stack.
+ */
+3:
+ movl %esi,%ebx
+ movl $move_routine_start,%esi
+ movl $0x1000,%edi
+ movl $move_routine_end,%ecx
+ subl %esi,%ecx
+ addl $3,%ecx
+ shrl $2,%ecx
+ cld
+ rep
+ movsl
+
+ popl %esi # discard the address
+ addl $4,%esp # real mode pointer
+ popl %esi # low_buffer_start
+ popl %ecx # lcount
+ popl %edx # high_buffer_start
+ popl %eax # hcount
+ movl $0x100000,%edi
+ cli # make sure we don't get interrupted
+ ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine
+
+/*
+ * Routine (template) for moving the decompressed kernel in place,
+ * if we were high loaded. This _must_ PIC-code !
+ */
+move_routine_start:
+ movl %ecx,%ebp
+ shrl $2,%ecx
+ rep
+ movsl
+ movl %ebp,%ecx
+ andl $3,%ecx
+ rep
+ movsb
+ movl %edx,%esi
+ movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0
+ addl $3,%ecx
+ shrl $2,%ecx
+ rep
+ movsl
+ movl %ebx,%esi # Restore setup pointer
+ xorl %ebx,%ebx
+ ljmp $(__KERNEL_CS), $0x100000
+move_routine_end:
+
+
+/* Stack for uncompression */
+ .align 32
+user_stack:
+ .fill 4096,4,0
+stack_start:
+ .long user_stack+4096
+ .word __KERNEL_DS
+
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
new file mode 100644
index 000000000000..c8b9216f9e63
--- /dev/null
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -0,0 +1,354 @@
+/*
+ * misc.c
+ *
+ * This is a collection of several routines from gzip-1.0.3
+ * adapted for Linux.
+ *
+ * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994
+ * puts by Nick Holloway 1993, better puts by Martin Mares 1995
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ */
+
+#include "miscsetup.h"
+#include <asm/io.h>
+
+/*
+ * gzip declarations
+ */
+
+#define OF(args) args
+#define STATIC static
+
+#undef memset
+#undef memcpy
+#define memzero(s, n) memset ((s), 0, (n))
+
+typedef unsigned char uch;
+typedef unsigned short ush;
+typedef unsigned long ulg;
+
+#define WSIZE 0x8000 /* Window size must be at least 32k, */
+ /* and a power of two */
+
+static uch *inbuf; /* input buffer */
+static uch window[WSIZE]; /* Sliding window buffer */
+
+static unsigned insize = 0; /* valid bytes in inbuf */
+static unsigned inptr = 0; /* index of next byte to be processed in inbuf */
+static unsigned outcnt = 0; /* bytes in output buffer */
+
+/* gzip flag byte */
+#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
+#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
+#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
+#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
+#define COMMENT 0x10 /* bit 4 set: file comment present */
+#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */
+#define RESERVED 0xC0 /* bit 6,7: reserved */
+
+#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf())
+
+/* Diagnostic functions */
+#ifdef DEBUG
+# define Assert(cond,msg) {if(!(cond)) error(msg);}
+# define Trace(x) fprintf x
+# define Tracev(x) {if (verbose) fprintf x ;}
+# define Tracevv(x) {if (verbose>1) fprintf x ;}
+# define Tracec(c,x) {if (verbose && (c)) fprintf x ;}
+# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;}
+#else
+# define Assert(cond,msg)
+# define Trace(x)
+# define Tracev(x)
+# define Tracevv(x)
+# define Tracec(c,x)
+# define Tracecv(c,x)
+#endif
+
+static int fill_inbuf(void);
+static void flush_window(void);
+static void error(char *m);
+static void gzip_mark(void **);
+static void gzip_release(void **);
+
+/*
+ * This is set up by the setup-routine at boot-time
+ */
+static unsigned char *real_mode; /* Pointer to real-mode data */
+
+#define EXT_MEM_K (*(unsigned short *)(real_mode + 0x2))
+#ifndef STANDARD_MEMORY_BIOS_CALL
+#define ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0))
+#endif
+#define SCREEN_INFO (*(struct screen_info *)(real_mode+0))
+
+extern char input_data[];
+extern int input_len;
+
+static long bytes_out = 0;
+static uch *output_data;
+static unsigned long output_ptr = 0;
+
+static void *malloc(int size);
+static void free(void *where);
+
+static void putstr(const char *);
+
+extern int end;
+static long free_mem_ptr = (long)&end;
+static long free_mem_end_ptr;
+
+#define INPLACE_MOVE_ROUTINE 0x1000
+#define LOW_BUFFER_START 0x2000
+#define LOW_BUFFER_MAX 0x90000
+#define HEAP_SIZE 0x3000
+static unsigned int low_buffer_end, low_buffer_size;
+static int high_loaded =0;
+static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
+
+static char *vidmem = (char *)0xb8000;
+static int vidport;
+static int lines, cols;
+
+#include "../../../../lib/inflate.c"
+
+static void *malloc(int size)
+{
+ void *p;
+
+ if (size <0) error("Malloc error");
+ if (free_mem_ptr <= 0) error("Memory error");
+
+ free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
+
+ p = (void *)free_mem_ptr;
+ free_mem_ptr += size;
+
+ if (free_mem_ptr >= free_mem_end_ptr)
+ error("Out of memory");
+
+ return p;
+}
+
+static void free(void *where)
+{ /* Don't care */
+}
+
+static void gzip_mark(void **ptr)
+{
+ *ptr = (void *) free_mem_ptr;
+}
+
+static void gzip_release(void **ptr)
+{
+ free_mem_ptr = (long) *ptr;
+}
+
+static void scroll(void)
+{
+ int i;
+
+ memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 );
+ for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 )
+ vidmem[i] = ' ';
+}
+
+static void putstr(const char *s)
+{
+ int x,y,pos;
+ char c;
+
+ x = SCREEN_INFO.orig_x;
+ y = SCREEN_INFO.orig_y;
+
+ while ( ( c = *s++ ) != '\0' ) {
+ if ( c == '\n' ) {
+ x = 0;
+ if ( ++y >= lines ) {
+ scroll();
+ y--;
+ }
+ } else {
+ vidmem [ ( x + cols * y ) * 2 ] = c;
+ if ( ++x >= cols ) {
+ x = 0;
+ if ( ++y >= lines ) {
+ scroll();
+ y--;
+ }
+ }
+ }
+ }
+
+ SCREEN_INFO.orig_x = x;
+ SCREEN_INFO.orig_y = y;
+
+ pos = (x + cols * y) * 2; /* Update cursor position */
+ outb_p(14, vidport);
+ outb_p(0xff & (pos >> 9), vidport+1);
+ outb_p(15, vidport);
+ outb_p(0xff & (pos >> 1), vidport+1);
+}
+
+void* memset(void* s, int c, unsigned n)
+{
+ int i;
+ char *ss = (char*)s;
+
+ for (i=0;i<n;i++) ss[i] = c;
+ return s;
+}
+
+void* memcpy(void* dest, const void* src, unsigned n)
+{
+ int i;
+ char *d = (char *)dest, *s = (char *)src;
+
+ for (i=0;i<n;i++) d[i] = s[i];
+ return dest;
+}
+
+/* ===========================================================================
+ * Fill the input buffer. This is called only when the buffer is empty
+ * and at least one byte is really needed.
+ */
+static int fill_inbuf(void)
+{
+ if (insize != 0) {
+ error("ran out of input data");
+ }
+
+ inbuf = input_data;
+ insize = input_len;
+ inptr = 1;
+ return inbuf[0];
+}
+
+/* ===========================================================================
+ * Write the output window window[0..outcnt-1] and update crc and bytes_out.
+ * (Used for the decompressed data only.)
+ */
+static void flush_window_low(void)
+{
+ ulg c = crc; /* temporary variable */
+ unsigned n;
+ uch *in, *out, ch;
+
+ in = window;
+ out = &output_data[output_ptr];
+ for (n = 0; n < outcnt; n++) {
+ ch = *out++ = *in++;
+ c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
+ }
+ crc = c;
+ bytes_out += (ulg)outcnt;
+ output_ptr += (ulg)outcnt;
+ outcnt = 0;
+}
+
+static void flush_window_high(void)
+{
+ ulg c = crc; /* temporary variable */
+ unsigned n;
+ uch *in, ch;
+ in = window;
+ for (n = 0; n < outcnt; n++) {
+ ch = *output_data++ = *in++;
+ if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start;
+ c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
+ }
+ crc = c;
+ bytes_out += (ulg)outcnt;
+ outcnt = 0;
+}
+
+static void flush_window(void)
+{
+ if (high_loaded) flush_window_high();
+ else flush_window_low();
+}
+
+static void error(char *x)
+{
+ putstr("\n\n");
+ putstr(x);
+ putstr("\n\n -- System halted");
+
+ while(1);
+}
+
+void setup_normal_output_buffer(void)
+{
+#ifdef STANDARD_MEMORY_BIOS_CALL
+ if (EXT_MEM_K < 1024) error("Less than 2MB of memory");
+#else
+ if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory");
+#endif
+ output_data = (char *)0x100000; /* Points to 1M */
+ free_mem_end_ptr = (long)real_mode;
+}
+
+struct moveparams {
+ uch *low_buffer_start; int lcount;
+ uch *high_buffer_start; int hcount;
+};
+
+void setup_output_buffer_if_we_run_high(struct moveparams *mv)
+{
+ high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
+#ifdef STANDARD_MEMORY_BIOS_CALL
+ if (EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
+#else
+ if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
+#endif
+ mv->low_buffer_start = output_data = (char *)LOW_BUFFER_START;
+ low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
+ ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;
+ low_buffer_size = low_buffer_end - LOW_BUFFER_START;
+ high_loaded = 1;
+ free_mem_end_ptr = (long)high_buffer_start;
+ if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) {
+ high_buffer_start = (uch *)(0x100000 + low_buffer_size);
+ mv->hcount = 0; /* say: we need not to move high_buffer */
+ }
+ else mv->hcount = -1;
+ mv->high_buffer_start = high_buffer_start;
+}
+
+void close_output_buffer_if_we_run_high(struct moveparams *mv)
+{
+ if (bytes_out > low_buffer_size) {
+ mv->lcount = low_buffer_size;
+ if (mv->hcount)
+ mv->hcount = bytes_out - low_buffer_size;
+ } else {
+ mv->lcount = bytes_out;
+ mv->hcount = 0;
+ }
+}
+
+int decompress_kernel(struct moveparams *mv, void *rmode)
+{
+ real_mode = rmode;
+
+ if (SCREEN_INFO.orig_video_mode == 7) {
+ vidmem = (char *) 0xb0000;
+ vidport = 0x3b4;
+ } else {
+ vidmem = (char *) 0xb8000;
+ vidport = 0x3d4;
+ }
+
+ lines = SCREEN_INFO.orig_video_lines;
+ cols = SCREEN_INFO.orig_video_cols;
+
+ if (free_mem_ptr < 0x100000) setup_normal_output_buffer();
+ else setup_output_buffer_if_we_run_high(mv);
+
+ makecrc();
+ putstr(".\nDecompressing Linux...");
+ gunzip();
+ putstr("done.\nBooting the kernel.\n");
+ if (high_loaded) close_output_buffer_if_we_run_high(mv);
+ return high_loaded;
+}
diff --git a/arch/x86_64/boot/compressed/miscsetup.h b/arch/x86_64/boot/compressed/miscsetup.h
new file mode 100644
index 000000000000..bb1620531703
--- /dev/null
+++ b/arch/x86_64/boot/compressed/miscsetup.h
@@ -0,0 +1,39 @@
+#define NULL 0
+//typedef unsigned int size_t;
+
+
+struct screen_info {
+ unsigned char orig_x; /* 0x00 */
+ unsigned char orig_y; /* 0x01 */
+ unsigned short dontuse1; /* 0x02 -- EXT_MEM_K sits here */
+ unsigned short orig_video_page; /* 0x04 */
+ unsigned char orig_video_mode; /* 0x06 */
+ unsigned char orig_video_cols; /* 0x07 */
+ unsigned short unused2; /* 0x08 */
+ unsigned short orig_video_ega_bx; /* 0x0a */
+ unsigned short unused3; /* 0x0c */
+ unsigned char orig_video_lines; /* 0x0e */
+ unsigned char orig_video_isVGA; /* 0x0f */
+ unsigned short orig_video_points; /* 0x10 */
+
+ /* VESA graphic mode -- linear frame buffer */
+ unsigned short lfb_width; /* 0x12 */
+ unsigned short lfb_height; /* 0x14 */
+ unsigned short lfb_depth; /* 0x16 */
+ unsigned long lfb_base; /* 0x18 */
+ unsigned long lfb_size; /* 0x1c */
+ unsigned short dontuse2, dontuse3; /* 0x20 -- CL_MAGIC and CL_OFFSET here */
+ unsigned short lfb_linelength; /* 0x24 */
+ unsigned char red_size; /* 0x26 */
+ unsigned char red_pos; /* 0x27 */
+ unsigned char green_size; /* 0x28 */
+ unsigned char green_pos; /* 0x29 */
+ unsigned char blue_size; /* 0x2a */
+ unsigned char blue_pos; /* 0x2b */
+ unsigned char rsvd_size; /* 0x2c */
+ unsigned char rsvd_pos; /* 0x2d */
+ unsigned short vesapm_seg; /* 0x2e */
+ unsigned short vesapm_off; /* 0x30 */
+ unsigned short pages; /* 0x32 */
+ /* 0x34 -- 0x3f reserved for future expansion */
+};
diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr
new file mode 100644
index 000000000000..1ed9d791f863
--- /dev/null
+++ b/arch/x86_64/boot/compressed/vmlinux.scr
@@ -0,0 +1,9 @@
+SECTIONS
+{
+ .data : {
+ input_len = .;
+ LONG(input_data_end - input_data) input_data = .;
+ *(.data)
+ input_data_end = .;
+ }
+}
diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh
new file mode 100644
index 000000000000..90f2452b3b9e
--- /dev/null
+++ b/arch/x86_64/boot/install.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+#
+# arch/i386/boot/install.sh
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 1995 by Linus Torvalds
+#
+# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin
+#
+# "make install" script for i386 architecture
+#
+# Arguments:
+# $1 - kernel version
+# $2 - kernel image file
+# $3 - kernel map file
+# $4 - default install path (blank if root directory)
+#
+
+# User may have a custom install script
+
+if [ -x ~/bin/installkernel ]; then exec ~/bin/installkernel "$@"; fi
+if [ -x /sbin/installkernel ]; then exec /sbin/installkernel "$@"; fi
+
+# Default install - same as make zlilo
+
+if [ -f $4/vmlinuz ]; then
+ mv $4/vmlinuz $4/vmlinuz.old
+fi
+
+if [ -f $4/System.map ]; then
+ mv $4/System.map $4/System.old
+fi
+
+cat $2 > $4/vmlinuz
+cp $3 $4/System.map
+
+if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi
diff --git a/arch/x86_64/boot/mtools.conf.in b/arch/x86_64/boot/mtools.conf.in
new file mode 100644
index 000000000000..efd6d2490c1d
--- /dev/null
+++ b/arch/x86_64/boot/mtools.conf.in
@@ -0,0 +1,17 @@
+#
+# mtools configuration file for "make (b)zdisk"
+#
+
+# Actual floppy drive
+drive a:
+ file="/dev/fd0"
+
+# 1.44 MB floppy disk image
+drive v:
+ file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=18 filter
+
+# 2.88 MB floppy disk image (mostly for virtual uses)
+drive w:
+ file="@OBJ@/fdimage" cylinders=80 heads=2 sectors=36 filter
+
+
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S
new file mode 100644
index 000000000000..3e838be9dbe7
--- /dev/null
+++ b/arch/x86_64/boot/setup.S
@@ -0,0 +1,867 @@
+/*
+ * setup.S Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * setup.s is responsible for getting the system data from the BIOS,
+ * and putting them into the appropriate places in system memory.
+ * both setup.s and system has been loaded by the bootblock.
+ *
+ * This code asks the bios for memory/disk/other parameters, and
+ * puts them in a "safe" place: 0x90000-0x901FF, ie where the
+ * boot-block used to be. It is then up to the protected mode
+ * system to read them from there before the area is overwritten
+ * for buffer-blocks.
+ *
+ * Move PS/2 aux init code to psaux.c
+ * (troyer@saifr00.cfsat.Honeywell.COM) 03Oct92
+ *
+ * some changes and additional features by Christoph Niemann,
+ * March 1993/June 1994 (Christoph.Niemann@linux.org)
+ *
+ * add APM BIOS checking by Stephen Rothwell, May 1994
+ * (sfr@canb.auug.org.au)
+ *
+ * High load stuff, initrd support and position independency
+ * by Hans Lermen & Werner Almesberger, February 1996
+ * <lermen@elserv.ffm.fgan.de>, <almesber@lrc.epfl.ch>
+ *
+ * Video handling moved to video.S by Martin Mares, March 1996
+ * <mj@k332.feld.cvut.cz>
+ *
+ * Extended memory detection scheme retwiddled by orc@pell.chi.il.us (david
+ * parsons) to avoid loadlin confusion, July 1997
+ *
+ * Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999.
+ * <stiker@northlink.com>
+ *
+ * Fix to work around buggy BIOSes which dont use carry bit correctly
+ * and/or report extended memory in CX/DX for e801h memory size detection
+ * call. As a result the kernel got wrong figures. The int15/e801h docs
+ * from Ralf Brown interrupt list seem to indicate AX/BX should be used
+ * anyway. So to avoid breaking many machines (presumably there was a reason
+ * to orginally use CX/DX instead of AX/BX), we do a kludge to see
+ * if CX/DX have been changed in the e801 call and if so use AX/BX .
+ * Michael Miller, April 2001 <michaelm@mjmm.org>
+ *
+ * Added long mode checking and SSE force. March 2003, Andi Kleen.
+ */
+
+#include <linux/config.h>
+#include <asm/segment.h>
+#include <linux/version.h>
+#include <linux/compile.h>
+#include <asm/boot.h>
+#include <asm/e820.h>
+#include <asm/page.h>
+
+/* Signature words to ensure LILO loaded us right */
+#define SIG1 0xAA55
+#define SIG2 0x5A5A
+
+INITSEG = DEF_INITSEG # 0x9000, we move boot here, out of the way
+SYSSEG = DEF_SYSSEG # 0x1000, system loaded at 0x10000 (65536).
+SETUPSEG = DEF_SETUPSEG # 0x9020, this is the current segment
+ # ... and the former contents of CS
+
+DELTA_INITSEG = SETUPSEG - INITSEG # 0x0020
+
+.code16
+.globl begtext, begdata, begbss, endtext, enddata, endbss
+
+.text
+begtext:
+.data
+begdata:
+.bss
+begbss:
+.text
+
+start:
+ jmp trampoline
+
+# This is the setup header, and it must start at %cs:2 (old 0x9020:2)
+
+ .ascii "HdrS" # header signature
+ .word 0x0203 # header version number (>= 0x0105)
+ # or else old loadlin-1.5 will fail)
+realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
+start_sys_seg: .word SYSSEG
+ .word kernel_version # pointing to kernel version string
+ # above section of header is compatible
+ # with loadlin-1.5 (header v1.5). Don't
+ # change it.
+
+type_of_loader: .byte 0 # = 0, old one (LILO, Loadlin,
+ # Bootlin, SYSLX, bootsect...)
+ # See Documentation/i386/boot.txt for
+ # assigned ids
+
+# flags, unused bits must be zero (RFU) bit within loadflags
+loadflags:
+LOADED_HIGH = 1 # If set, the kernel is loaded high
+CAN_USE_HEAP = 0x80 # If set, the loader also has set
+ # heap_end_ptr to tell how much
+ # space behind setup.S can be used for
+ # heap purposes.
+ # Only the loader knows what is free
+#ifndef __BIG_KERNEL__
+ .byte 0
+#else
+ .byte LOADED_HIGH
+#endif
+
+setup_move_size: .word 0x8000 # size to move, when setup is not
+ # loaded at 0x90000. We will move setup
+ # to 0x90000 then just before jumping
+ # into the kernel. However, only the
+ # loader knows how much data behind
+ # us also needs to be loaded.
+
+code32_start: # here loaders can put a different
+ # start address for 32-bit code.
+#ifndef __BIG_KERNEL__
+ .long 0x1000 # 0x1000 = default for zImage
+#else
+ .long 0x100000 # 0x100000 = default for big kernel
+#endif
+
+ramdisk_image: .long 0 # address of loaded ramdisk image
+ # Here the loader puts the 32-bit
+ # address where it loaded the image.
+ # This only will be read by the kernel.
+
+ramdisk_size: .long 0 # its size in bytes
+
+bootsect_kludge:
+ .long 0 # obsolete
+
+heap_end_ptr: .word modelist+1024 # (Header version 0x0201 or later)
+ # space from here (exclusive) down to
+ # end of setup code can be used by setup
+ # for local heap purposes.
+
+pad1: .word 0
+cmd_line_ptr: .long 0 # (Header version 0x0202 or later)
+ # If nonzero, a 32-bit pointer
+ # to the kernel command line.
+ # The command line should be
+ # located between the start of
+ # setup and the end of low
+ # memory (0xa0000), or it may
+ # get overwritten before it
+ # gets read. If this field is
+ # used, there is no longer
+ # anything magical about the
+ # 0x90000 segment; the setup
+ # can be located anywhere in
+ # low memory 0x10000 or higher.
+
+ramdisk_max: .long 0xffffffff
+
+trampoline: call start_of_setup
+ .align 16
+ # The offset at this point is 0x240
+ .space (0x7ff-0x240+1) # E820 & EDD space (ending at 0x7ff)
+# End of setup header #####################################################
+
+start_of_setup:
+# Bootlin depends on this being done early
+ movw $0x01500, %ax
+ movb $0x81, %dl
+ int $0x13
+
+#ifdef SAFE_RESET_DISK_CONTROLLER
+# Reset the disk controller.
+ movw $0x0000, %ax
+ movb $0x80, %dl
+ int $0x13
+#endif
+
+# Set %ds = %cs, we know that SETUPSEG = %cs at this point
+ movw %cs, %ax # aka SETUPSEG
+ movw %ax, %ds
+# Check signature at end of setup
+ cmpw $SIG1, setup_sig1
+ jne bad_sig
+
+ cmpw $SIG2, setup_sig2
+ jne bad_sig
+
+ jmp good_sig1
+
+# Routine to print asciiz string at ds:si
+prtstr:
+ lodsb
+ andb %al, %al
+ jz fin
+
+ call prtchr
+ jmp prtstr
+
+fin: ret
+
+# Space printing
+prtsp2: call prtspc # Print double space
+prtspc: movb $0x20, %al # Print single space (note: fall-thru)
+
+prtchr:
+ pushw %ax
+ pushw %cx
+ movw $0007,%bx
+ movw $0x01, %cx
+ movb $0x0e, %ah
+ int $0x10
+ popw %cx
+ popw %ax
+ ret
+
+beep: movb $0x07, %al
+ jmp prtchr
+
+no_sig_mess: .string "No setup signature found ..."
+
+good_sig1:
+ jmp good_sig
+
+# We now have to find the rest of the setup code/data
+bad_sig:
+ movw %cs, %ax # SETUPSEG
+ subw $DELTA_INITSEG, %ax # INITSEG
+ movw %ax, %ds
+ xorb %bh, %bh
+ movb (497), %bl # get setup sect from bootsect
+ subw $4, %bx # LILO loads 4 sectors of setup
+ shlw $8, %bx # convert to words (1sect=2^8 words)
+ movw %bx, %cx
+ shrw $3, %bx # convert to segment
+ addw $SYSSEG, %bx
+ movw %bx, %cs:start_sys_seg
+# Move rest of setup code/data to here
+ movw $2048, %di # four sectors loaded by LILO
+ subw %si, %si
+ movw %cs, %ax # aka SETUPSEG
+ movw %ax, %es
+ movw $SYSSEG, %ax
+ movw %ax, %ds
+ rep
+ movsw
+ movw %cs, %ax # aka SETUPSEG
+ movw %ax, %ds
+ cmpw $SIG1, setup_sig1
+ jne no_sig
+
+ cmpw $SIG2, setup_sig2
+ jne no_sig
+
+ jmp good_sig
+
+no_sig:
+ lea no_sig_mess, %si
+ call prtstr
+
+no_sig_loop:
+ jmp no_sig_loop
+
+good_sig:
+ movw %cs, %ax # aka SETUPSEG
+ subw $DELTA_INITSEG, %ax # aka INITSEG
+ movw %ax, %ds
+# Check if an old loader tries to load a big-kernel
+ testb $LOADED_HIGH, %cs:loadflags # Do we have a big kernel?
+ jz loader_ok # No, no danger for old loaders.
+
+ cmpb $0, %cs:type_of_loader # Do we have a loader that
+ # can deal with us?
+ jnz loader_ok # Yes, continue.
+
+ pushw %cs # No, we have an old loader,
+ popw %ds # die.
+ lea loader_panic_mess, %si
+ call prtstr
+
+ jmp no_sig_loop
+
+loader_panic_mess: .string "Wrong loader, giving up..."
+
+loader_ok:
+ /* check for long mode. */
+ /* we have to do this before the VESA setup, otherwise the user
+ can't see the error message. */
+
+ pushw %ds
+ movw %cs,%ax
+ movw %ax,%ds
+
+ /* minimum CPUID flags for x86-64 */
+ /* see http://www.x86-64.org/lists/discuss/msg02971.html */
+#define SSE_MASK ((1<<25)|(1<<26))
+#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\
+ (1<<13)|(1<<15)|(1<<24))
+#define REQUIRED_MASK2 (1<<29)
+
+ pushfl /* standard way to check for cpuid */
+ popl %eax
+ movl %eax,%ebx
+ xorl $0x200000,%eax
+ pushl %eax
+ popfl
+ pushfl
+ popl %eax
+ cmpl %eax,%ebx
+ jz no_longmode /* cpu has no cpuid */
+ movl $0x0,%eax
+ cpuid
+ cmpl $0x1,%eax
+ jb no_longmode /* no cpuid 1 */
+ xor %di,%di
+ cmpl $0x68747541,%ebx /* AuthenticAMD */
+ jnz noamd
+ cmpl $0x69746e65,%edx
+ jnz noamd
+ cmpl $0x444d4163,%ecx
+ jnz noamd
+ mov $1,%di /* cpu is from AMD */
+noamd:
+ movl $0x1,%eax
+ cpuid
+ andl $REQUIRED_MASK1,%edx
+ xorl $REQUIRED_MASK1,%edx
+ jnz no_longmode
+ movl $0x80000000,%eax
+ cpuid
+ cmpl $0x80000001,%eax
+ jb no_longmode /* no extended cpuid */
+ movl $0x80000001,%eax
+ cpuid
+ andl $REQUIRED_MASK2,%edx
+ xorl $REQUIRED_MASK2,%edx
+ jnz no_longmode
+sse_test:
+ movl $1,%eax
+ cpuid
+ andl $SSE_MASK,%edx
+ cmpl $SSE_MASK,%edx
+ je sse_ok
+ test %di,%di
+ jz no_longmode /* only try to force SSE on AMD */
+ movl $0xc0010015,%ecx /* HWCR */
+ rdmsr
+ btr $15,%eax /* enable SSE */
+ wrmsr
+ xor %di,%di /* don't loop */
+ jmp sse_test /* try again */
+no_longmode:
+ call beep
+ lea long_mode_panic,%si
+ call prtstr
+no_longmode_loop:
+ jmp no_longmode_loop
+long_mode_panic:
+ .string "Your CPU does not support long mode. Use a 32bit distribution."
+ .byte 0
+
+sse_ok:
+ popw %ds
+
+# tell BIOS we want to go to long mode
+ movl $0xec00,%eax # declare target operating mode
+ movl $2,%ebx # long mode
+ int $0x15
+
+# Get memory size (extended mem, kB)
+
+ xorl %eax, %eax
+ movl %eax, (0x1e0)
+#ifndef STANDARD_MEMORY_BIOS_CALL
+ movb %al, (E820NR)
+# Try three different memory detection schemes. First, try
+# e820h, which lets us assemble a memory map, then try e801h,
+# which returns a 32-bit memory size, and finally 88h, which
+# returns 0-64m
+
+# method E820H:
+# the memory map from hell. e820h returns memory classified into
+# a whole bunch of different types, and allows memory holes and
+# everything. We scan through this memory map and build a list
+# of the first 32 memory areas, which we return at [E820MAP].
+# This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm
+
+#define SMAP 0x534d4150
+
+meme820:
+ xorl %ebx, %ebx # continuation counter
+ movw $E820MAP, %di # point into the whitelist
+ # so we can have the bios
+ # directly write into it.
+
+jmpe820:
+ movl $0x0000e820, %eax # e820, upper word zeroed
+ movl $SMAP, %edx # ascii 'SMAP'
+ movl $20, %ecx # size of the e820rec
+ pushw %ds # data record.
+ popw %es
+ int $0x15 # make the call
+ jc bail820 # fall to e801 if it fails
+
+ cmpl $SMAP, %eax # check the return is `SMAP'
+ jne bail820 # fall to e801 if it fails
+
+# cmpl $1, 16(%di) # is this usable memory?
+# jne again820
+
+ # If this is usable memory, we save it by simply advancing %di by
+ # sizeof(e820rec).
+ #
+good820:
+ movb (E820NR), %al # up to 32 entries
+ cmpb $E820MAX, %al
+ jnl bail820
+
+ incb (E820NR)
+ movw %di, %ax
+ addw $20, %ax
+ movw %ax, %di
+again820:
+ cmpl $0, %ebx # check to see if
+ jne jmpe820 # %ebx is set to EOF
+bail820:
+
+
+# method E801H:
+# memory size is in 1k chunksizes, to avoid confusing loadlin.
+# we store the 0xe801 memory size in a completely different place,
+# because it will most likely be longer than 16 bits.
+# (use 1e0 because that's what Larry Augustine uses in his
+# alternative new memory detection scheme, and it's sensible
+# to write everything into the same place.)
+
+meme801:
+ stc # fix to work around buggy
+ xorw %cx,%cx # BIOSes which dont clear/set
+ xorw %dx,%dx # carry on pass/error of
+ # e801h memory size call
+ # or merely pass cx,dx though
+ # without changing them.
+ movw $0xe801, %ax
+ int $0x15
+ jc mem88
+
+ cmpw $0x0, %cx # Kludge to handle BIOSes
+ jne e801usecxdx # which report their extended
+ cmpw $0x0, %dx # memory in AX/BX rather than
+ jne e801usecxdx # CX/DX. The spec I have read
+ movw %ax, %cx # seems to indicate AX/BX
+ movw %bx, %dx # are more reasonable anyway...
+
+e801usecxdx:
+ andl $0xffff, %edx # clear sign extend
+ shll $6, %edx # and go from 64k to 1k chunks
+ movl %edx, (0x1e0) # store extended memory size
+ andl $0xffff, %ecx # clear sign extend
+ addl %ecx, (0x1e0) # and add lower memory into
+ # total size.
+
+# Ye Olde Traditional Methode. Returns the memory size (up to 16mb or
+# 64mb, depending on the bios) in ax.
+mem88:
+
+#endif
+ movb $0x88, %ah
+ int $0x15
+ movw %ax, (2)
+
+# Set the keyboard repeat rate to the max
+ movw $0x0305, %ax
+ xorw %bx, %bx
+ int $0x16
+
+# Check for video adapter and its parameters and allow the
+# user to browse video modes.
+ call video # NOTE: we need %ds pointing
+ # to bootsector
+
+# Get hd0 data...
+ xorw %ax, %ax
+ movw %ax, %ds
+ ldsw (4 * 0x41), %si
+ movw %cs, %ax # aka SETUPSEG
+ subw $DELTA_INITSEG, %ax # aka INITSEG
+ pushw %ax
+ movw %ax, %es
+ movw $0x0080, %di
+ movw $0x10, %cx
+ pushw %cx
+ cld
+ rep
+ movsb
+# Get hd1 data...
+ xorw %ax, %ax
+ movw %ax, %ds
+ ldsw (4 * 0x46), %si
+ popw %cx
+ popw %es
+ movw $0x0090, %di
+ rep
+ movsb
+# Check that there IS a hd1 :-)
+ movw $0x01500, %ax
+ movb $0x81, %dl
+ int $0x13
+ jc no_disk1
+
+ cmpb $3, %ah
+ je is_disk1
+
+no_disk1:
+ movw %cs, %ax # aka SETUPSEG
+ subw $DELTA_INITSEG, %ax # aka INITSEG
+ movw %ax, %es
+ movw $0x0090, %di
+ movw $0x10, %cx
+ xorw %ax, %ax
+ cld
+ rep
+ stosb
+is_disk1:
+
+# Check for PS/2 pointing device
+ movw %cs, %ax # aka SETUPSEG
+ subw $DELTA_INITSEG, %ax # aka INITSEG
+ movw %ax, %ds
+ movw $0, (0x1ff) # default is no pointing device
+ int $0x11 # int 0x11: equipment list
+ testb $0x04, %al # check if mouse installed
+ jz no_psmouse
+
+ movw $0xAA, (0x1ff) # device present
+no_psmouse:
+
+#include "../../i386/boot/edd.S"
+
+# Now we want to move to protected mode ...
+ cmpw $0, %cs:realmode_swtch
+ jz rmodeswtch_normal
+
+ lcall *%cs:realmode_swtch
+
+ jmp rmodeswtch_end
+
+rmodeswtch_normal:
+ pushw %cs
+ call default_switch
+
+rmodeswtch_end:
+# we get the code32 start address and modify the below 'jmpi'
+# (loader may have changed it)
+ movl %cs:code32_start, %eax
+ movl %eax, %cs:code32
+
+# Now we move the system to its rightful place ... but we check if we have a
+# big-kernel. In that case we *must* not move it ...
+ testb $LOADED_HIGH, %cs:loadflags
+ jz do_move0 # .. then we have a normal low
+ # loaded zImage
+ # .. or else we have a high
+ # loaded bzImage
+ jmp end_move # ... and we skip moving
+
+do_move0:
+ movw $0x100, %ax # start of destination segment
+ movw %cs, %bp # aka SETUPSEG
+ subw $DELTA_INITSEG, %bp # aka INITSEG
+ movw %cs:start_sys_seg, %bx # start of source segment
+ cld
+do_move:
+ movw %ax, %es # destination segment
+ incb %ah # instead of add ax,#0x100
+ movw %bx, %ds # source segment
+ addw $0x100, %bx
+ subw %di, %di
+ subw %si, %si
+ movw $0x800, %cx
+ rep
+ movsw
+ cmpw %bp, %bx # assume start_sys_seg > 0x200,
+ # so we will perhaps read one
+ # page more than needed, but
+ # never overwrite INITSEG
+ # because destination is a
+ # minimum one page below source
+ jb do_move
+
+end_move:
+# then we load the segment descriptors
+ movw %cs, %ax # aka SETUPSEG
+ movw %ax, %ds
+
+# Check whether we need to be downward compatible with version <=201
+ cmpl $0, cmd_line_ptr
+ jne end_move_self # loader uses version >=202 features
+ cmpb $0x20, type_of_loader
+ je end_move_self # bootsect loader, we know of it
+
+# Boot loader doesnt support boot protocol version 2.02.
+# If we have our code not at 0x90000, we need to move it there now.
+# We also then need to move the params behind it (commandline)
+# Because we would overwrite the code on the current IP, we move
+# it in two steps, jumping high after the first one.
+ movw %cs, %ax
+ cmpw $SETUPSEG, %ax
+ je end_move_self
+
+ cli # make sure we really have
+ # interrupts disabled !
+ # because after this the stack
+ # should not be used
+ subw $DELTA_INITSEG, %ax # aka INITSEG
+ movw %ss, %dx
+ cmpw %ax, %dx
+ jb move_self_1
+
+ addw $INITSEG, %dx
+ subw %ax, %dx # this will go into %ss after
+ # the move
+move_self_1:
+ movw %ax, %ds
+ movw $INITSEG, %ax # real INITSEG
+ movw %ax, %es
+ movw %cs:setup_move_size, %cx
+ std # we have to move up, so we use
+ # direction down because the
+ # areas may overlap
+ movw %cx, %di
+ decw %di
+ movw %di, %si
+ subw $move_self_here+0x200, %cx
+ rep
+ movsb
+ ljmp $SETUPSEG, $move_self_here
+
+move_self_here:
+ movw $move_self_here+0x200, %cx
+ rep
+ movsb
+ movw $SETUPSEG, %ax
+ movw %ax, %ds
+ movw %dx, %ss
+end_move_self: # now we are at the right place
+ lidt idt_48 # load idt with 0,0
+ xorl %eax, %eax # Compute gdt_base
+ movw %ds, %ax # (Convert %ds:gdt to a linear ptr)
+ shll $4, %eax
+ addl $gdt, %eax
+ movl %eax, (gdt_48+2)
+ lgdt gdt_48 # load gdt with whatever is
+ # appropriate
+
+# that was painless, now we enable a20
+ call empty_8042
+
+ movb $0xD1, %al # command write
+ outb %al, $0x64
+ call empty_8042
+
+ movb $0xDF, %al # A20 on
+ outb %al, $0x60
+ call empty_8042
+
+#
+# You must preserve the other bits here. Otherwise embarrasing things
+# like laptops powering off on boot happen. Corrected version by Kira
+# Brown from Linux 2.2
+#
+ inb $0x92, %al #
+ orb $02, %al # "fast A20" version
+ outb %al, $0x92 # some chips have only this
+
+# wait until a20 really *is* enabled; it can take a fair amount of
+# time on certain systems; Toshiba Tecras are known to have this
+# problem. The memory location used here (0x200) is the int 0x80
+# vector, which should be safe to use.
+
+ xorw %ax, %ax # segment 0x0000
+ movw %ax, %fs
+ decw %ax # segment 0xffff (HMA)
+ movw %ax, %gs
+a20_wait:
+ incw %ax # unused memory location <0xfff0
+ movw %ax, %fs:(0x200) # we use the "int 0x80" vector
+ cmpw %gs:(0x210), %ax # and its corresponding HMA addr
+ je a20_wait # loop until no longer aliased
+
+# make sure any possible coprocessor is properly reset..
+ xorw %ax, %ax
+ outb %al, $0xf0
+ call delay
+
+ outb %al, $0xf1
+ call delay
+
+# well, that went ok, I hope. Now we mask all interrupts - the rest
+# is done in init_IRQ().
+ movb $0xFF, %al # mask all interrupts for now
+ outb %al, $0xA1
+ call delay
+
+ movb $0xFB, %al # mask all irq's but irq2 which
+ outb %al, $0x21 # is cascaded
+
+# Well, that certainly wasn't fun :-(. Hopefully it works, and we don't
+# need no steenking BIOS anyway (except for the initial loading :-).
+# The BIOS-routine wants lots of unnecessary data, and it's less
+# "interesting" anyway. This is how REAL programmers do it.
+#
+# Well, now's the time to actually move into protected mode. To make
+# things as simple as possible, we do no register set-up or anything,
+# we let the gnu-compiled 32-bit programs do that. We just jump to
+# absolute address 0x1000 (or the loader supplied one),
+# in 32-bit protected mode.
+#
+# Note that the short jump isn't strictly needed, although there are
+# reasons why it might be a good idea. It won't hurt in any case.
+ movw $1, %ax # protected mode (PE) bit
+ lmsw %ax # This is it!
+ jmp flush_instr
+
+flush_instr:
+ xorw %bx, %bx # Flag to indicate a boot
+ xorl %esi, %esi # Pointer to real-mode code
+ movw %cs, %si
+ subw $DELTA_INITSEG, %si
+ shll $4, %esi # Convert to 32-bit pointer
+# NOTE: For high loaded big kernels we need a
+# jmpi 0x100000,__KERNEL_CS
+#
+# but we yet haven't reloaded the CS register, so the default size
+# of the target offset still is 16 bit.
+# However, using an operant prefix (0x66), the CPU will properly
+# take our 48 bit far pointer. (INTeL 80386 Programmer's Reference
+# Manual, Mixing 16-bit and 32-bit code, page 16-6)
+
+ .byte 0x66, 0xea # prefix + jmpi-opcode
+code32: .long 0x1000 # will be set to 0x100000
+ # for big kernels
+ .word __KERNEL_CS
+
+# Here's a bunch of information about your current kernel..
+kernel_version: .ascii UTS_RELEASE
+ .ascii " ("
+ .ascii LINUX_COMPILE_BY
+ .ascii "@"
+ .ascii LINUX_COMPILE_HOST
+ .ascii ") "
+ .ascii UTS_VERSION
+ .byte 0
+
+# This is the default real mode switch routine.
+# to be called just before protected mode transition
+default_switch:
+ cli # no interrupts allowed !
+ movb $0x80, %al # disable NMI for bootup
+ # sequence
+ outb %al, $0x70
+ lret
+
+
+# This routine checks that the keyboard command queue is empty
+# (after emptying the output buffers)
+#
+# Some machines have delusions that the keyboard buffer is always full
+# with no keyboard attached...
+#
+# If there is no keyboard controller, we will usually get 0xff
+# to all the reads. With each IO taking a microsecond and
+# a timeout of 100,000 iterations, this can take about half a
+# second ("delay" == outb to port 0x80). That should be ok,
+# and should also be plenty of time for a real keyboard controller
+# to empty.
+#
+
+empty_8042:
+ pushl %ecx
+ movl $100000, %ecx
+
+empty_8042_loop:
+ decl %ecx
+ jz empty_8042_end_loop
+
+ call delay
+
+ inb $0x64, %al # 8042 status port
+ testb $1, %al # output buffer?
+ jz no_output
+
+ call delay
+ inb $0x60, %al # read it
+ jmp empty_8042_loop
+
+no_output:
+ testb $2, %al # is input buffer full?
+ jnz empty_8042_loop # yes - loop
+empty_8042_end_loop:
+ popl %ecx
+ ret
+
+# Read the cmos clock. Return the seconds in al
+gettime:
+ pushw %cx
+ movb $0x02, %ah
+ int $0x1a
+ movb %dh, %al # %dh contains the seconds
+ andb $0x0f, %al
+ movb %dh, %ah
+ movb $0x04, %cl
+ shrb %cl, %ah
+ aad
+ popw %cx
+ ret
+
+# Delay is needed after doing I/O
+delay:
+ outb %al,$0x80
+ ret
+
+# Descriptor tables
+gdt:
+ .word 0, 0, 0, 0 # dummy
+
+ .word 0, 0, 0, 0 # unused
+
+ .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
+ .word 0 # base address = 0
+ .word 0x9A00 # code read/exec
+ .word 0x00CF # granularity = 4096, 386
+ # (+5th nibble of limit)
+
+ .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
+ .word 0 # base address = 0
+ .word 0x9200 # data read/write
+ .word 0x00CF # granularity = 4096, 386
+ # (+5th nibble of limit)
+idt_48:
+ .word 0 # idt limit = 0
+ .word 0, 0 # idt base = 0L
+gdt_48:
+ .word 0x8000 # gdt limit=2048,
+ # 256 GDT entries
+
+ .word 0, 0 # gdt base (filled in later)
+
+# Include video setup & detection code
+
+#include "video.S"
+
+# Setup signature -- must be last
+setup_sig1: .word SIG1
+setup_sig2: .word SIG2
+
+# After this point, there is some free space which is used by the video mode
+# handling code to store the temporary mode table (not used by the kernel).
+
+modelist:
+
+.text
+endtext:
+.data
+enddata:
+.bss
+endbss:
diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c
new file mode 100644
index 000000000000..c2fa66313170
--- /dev/null
+++ b/arch/x86_64/boot/tools/build.c
@@ -0,0 +1,186 @@
+/*
+ * $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 1997 Martin Mares
+ */
+
+/*
+ * This file builds a disk-image from three different files:
+ *
+ * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest
+ * - setup: 8086 machine code, sets up system parm
+ * - system: 80386 code for actual system
+ *
+ * It does some checking that all files are of the correct type, and
+ * just writes the result to stdout, removing headers and padding to
+ * the right amount. It also writes some system data to stderr.
+ */
+
+/*
+ * Changes by tytso to allow root device specification
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ * Cross compiling fixes by Gertjan van Wingerde, July 1996
+ * Rewritten by Martin Mares, April 1997
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <asm/boot.h>
+
+typedef unsigned char byte;
+typedef unsigned short word;
+typedef unsigned long u32;
+
+#define DEFAULT_MAJOR_ROOT 0
+#define DEFAULT_MINOR_ROOT 0
+
+/* Minimal number of setup sectors (see also bootsect.S) */
+#define SETUP_SECTS 4
+
+byte buf[1024];
+int fd;
+int is_big_kernel;
+
+void die(const char * str, ...)
+{
+ va_list args;
+ va_start(args, str);
+ vfprintf(stderr, str, args);
+ fputc('\n', stderr);
+ exit(1);
+}
+
+void file_open(const char *name)
+{
+ if ((fd = open(name, O_RDONLY, 0)) < 0)
+ die("Unable to open `%s': %m", name);
+}
+
+void usage(void)
+{
+ die("Usage: build [-b] bootsect setup system [rootdev] [> image]");
+}
+
+int main(int argc, char ** argv)
+{
+ unsigned int i, c, sz, setup_sectors;
+ u32 sys_size;
+ byte major_root, minor_root;
+ struct stat sb;
+
+ if (argc > 2 && !strcmp(argv[1], "-b"))
+ {
+ is_big_kernel = 1;
+ argc--, argv++;
+ }
+ if ((argc < 4) || (argc > 5))
+ usage();
+ if (argc > 4) {
+ if (!strcmp(argv[4], "CURRENT")) {
+ if (stat("/", &sb)) {
+ perror("/");
+ die("Couldn't stat /");
+ }
+ major_root = major(sb.st_dev);
+ minor_root = minor(sb.st_dev);
+ } else if (strcmp(argv[4], "FLOPPY")) {
+ if (stat(argv[4], &sb)) {
+ perror(argv[4]);
+ die("Couldn't stat root device.");
+ }
+ major_root = major(sb.st_rdev);
+ minor_root = minor(sb.st_rdev);
+ } else {
+ major_root = 0;
+ minor_root = 0;
+ }
+ } else {
+ major_root = DEFAULT_MAJOR_ROOT;
+ minor_root = DEFAULT_MINOR_ROOT;
+ }
+ fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root);
+
+ file_open(argv[1]);
+ i = read(fd, buf, sizeof(buf));
+ fprintf(stderr,"Boot sector %d bytes.\n",i);
+ if (i != 512)
+ die("Boot block must be exactly 512 bytes");
+ if (buf[510] != 0x55 || buf[511] != 0xaa)
+ die("Boot block hasn't got boot flag (0xAA55)");
+ buf[508] = minor_root;
+ buf[509] = major_root;
+ if (write(1, buf, 512) != 512)
+ die("Write call failed");
+ close (fd);
+
+ file_open(argv[2]); /* Copy the setup code */
+ for (i=0 ; (c=read(fd, buf, sizeof(buf)))>0 ; i+=c )
+ if (write(1, buf, c) != c)
+ die("Write call failed");
+ if (c != 0)
+ die("read-error on `setup'");
+ close (fd);
+
+ setup_sectors = (i + 511) / 512; /* Pad unused space with zeros */
+ /* for compatibility with ancient versions of LILO. */
+ if (setup_sectors < SETUP_SECTS)
+ setup_sectors = SETUP_SECTS;
+ fprintf(stderr, "Setup is %d bytes.\n", i);
+ memset(buf, 0, sizeof(buf));
+ while (i < setup_sectors * 512) {
+ c = setup_sectors * 512 - i;
+ if (c > sizeof(buf))
+ c = sizeof(buf);
+ if (write(1, buf, c) != c)
+ die("Write call failed");
+ i += c;
+ }
+
+ file_open(argv[3]);
+ if (fstat (fd, &sb))
+ die("Unable to stat `%s': %m", argv[3]);
+ sz = sb.st_size;
+ fprintf (stderr, "System is %d kB\n", sz/1024);
+ sys_size = (sz + 15) / 16;
+ /* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */
+ if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE))
+ die("System is too big. Try using %smodules.",
+ is_big_kernel ? "" : "bzImage or ");
+ while (sz > 0) {
+ int l, n;
+
+ l = (sz > sizeof(buf)) ? sizeof(buf) : sz;
+ if ((n=read(fd, buf, l)) != l) {
+ if (n < 0)
+ die("Error reading %s: %m", argv[3]);
+ else
+ die("%s: Unexpected EOF", argv[3]);
+ }
+ if (write(1, buf, l) != l)
+ die("Write failed");
+ sz -= l;
+ }
+ close(fd);
+
+ if (lseek(1, 497, SEEK_SET) != 497) /* Write sizes to the bootsector */
+ die("Output: seek failed");
+ buf[0] = setup_sectors;
+ if (write(1, buf, 1) != 1)
+ die("Write of setup sector count failed");
+ if (lseek(1, 500, SEEK_SET) != 500)
+ die("Output: seek failed");
+ buf[0] = (sys_size & 0xff);
+ buf[1] = ((sys_size >> 8) & 0xff);
+ if (write(1, buf, 2) != 2)
+ die("Write of image length failed");
+
+ return 0; /* Everything is OK */
+}
diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S
new file mode 100644
index 000000000000..0587477c99f2
--- /dev/null
+++ b/arch/x86_64/boot/video.S
@@ -0,0 +1,2007 @@
+/* video.S
+ *
+ * Display adapter & video mode setup, version 2.13 (14-May-99)
+ *
+ * Copyright (C) 1995 -- 1998 Martin Mares <mj@ucw.cz>
+ * Based on the original setup.S code (C) Linus Torvalds and Mats Anderson
+ *
+ * Rewritten to use GNU 'as' by Chris Noe <stiker@northlink.com> May 1999
+ *
+ * For further information, look at Documentation/svga.txt.
+ *
+ */
+
+#include <linux/config.h> /* for CONFIG_VIDEO_* */
+
+/* Enable autodetection of SVGA adapters and modes. */
+#undef CONFIG_VIDEO_SVGA
+
+/* Enable autodetection of VESA modes */
+#define CONFIG_VIDEO_VESA
+
+/* Enable compacting of mode table */
+#define CONFIG_VIDEO_COMPACT
+
+/* Retain screen contents when switching modes */
+#define CONFIG_VIDEO_RETAIN
+
+/* Enable local mode list */
+#undef CONFIG_VIDEO_LOCAL
+
+/* Force 400 scan lines for standard modes (hack to fix bad BIOS behaviour */
+#undef CONFIG_VIDEO_400_HACK
+
+/* Hack that lets you force specific BIOS mode ID and specific dimensions */
+#undef CONFIG_VIDEO_GFX_HACK
+#define VIDEO_GFX_BIOS_AX 0x4f02 /* 800x600 on ThinkPad */
+#define VIDEO_GFX_BIOS_BX 0x0102
+#define VIDEO_GFX_DUMMY_RESOLUTION 0x6425 /* 100x37 */
+
+/* This code uses an extended set of video mode numbers. These include:
+ * Aliases for standard modes
+ * NORMAL_VGA (-1)
+ * EXTENDED_VGA (-2)
+ * ASK_VGA (-3)
+ * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
+ * of compatibility when extending the table. These are between 0x00 and 0xff.
+ */
+#define VIDEO_FIRST_MENU 0x0000
+
+/* Standard BIOS video modes (BIOS number + 0x0100) */
+#define VIDEO_FIRST_BIOS 0x0100
+
+/* VESA BIOS video modes (VESA number + 0x0200) */
+#define VIDEO_FIRST_VESA 0x0200
+
+/* Video7 special modes (BIOS number + 0x0900) */
+#define VIDEO_FIRST_V7 0x0900
+
+/* Special video modes */
+#define VIDEO_FIRST_SPECIAL 0x0f00
+#define VIDEO_80x25 0x0f00
+#define VIDEO_8POINT 0x0f01
+#define VIDEO_80x43 0x0f02
+#define VIDEO_80x28 0x0f03
+#define VIDEO_CURRENT_MODE 0x0f04
+#define VIDEO_80x30 0x0f05
+#define VIDEO_80x34 0x0f06
+#define VIDEO_80x60 0x0f07
+#define VIDEO_GFX_HACK 0x0f08
+#define VIDEO_LAST_SPECIAL 0x0f09
+
+/* Video modes given by resolution */
+#define VIDEO_FIRST_RESOLUTION 0x1000
+
+/* The "recalculate timings" flag */
+#define VIDEO_RECALC 0x8000
+
+/* Positions of various video parameters passed to the kernel */
+/* (see also include/linux/tty.h) */
+#define PARAM_CURSOR_POS 0x00
+#define PARAM_VIDEO_PAGE 0x04
+#define PARAM_VIDEO_MODE 0x06
+#define PARAM_VIDEO_COLS 0x07
+#define PARAM_VIDEO_EGA_BX 0x0a
+#define PARAM_VIDEO_LINES 0x0e
+#define PARAM_HAVE_VGA 0x0f
+#define PARAM_FONT_POINTS 0x10
+
+#define PARAM_LFB_WIDTH 0x12
+#define PARAM_LFB_HEIGHT 0x14
+#define PARAM_LFB_DEPTH 0x16
+#define PARAM_LFB_BASE 0x18
+#define PARAM_LFB_SIZE 0x1c
+#define PARAM_LFB_LINELENGTH 0x24
+#define PARAM_LFB_COLORS 0x26
+#define PARAM_VESAPM_SEG 0x2e
+#define PARAM_VESAPM_OFF 0x30
+#define PARAM_LFB_PAGES 0x32
+#define PARAM_VESA_ATTRIB 0x34
+
+/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
+#ifdef CONFIG_VIDEO_RETAIN
+#define DO_STORE call store_screen
+#else
+#define DO_STORE
+#endif /* CONFIG_VIDEO_RETAIN */
+
+# This is the main entry point called by setup.S
+# %ds *must* be pointing to the bootsector
+video: pushw %ds # We use different segments
+ pushw %ds # FS contains original DS
+ popw %fs
+ pushw %cs # DS is equal to CS
+ popw %ds
+ pushw %cs # ES is equal to CS
+ popw %es
+ xorw %ax, %ax
+ movw %ax, %gs # GS is zero
+ cld
+ call basic_detect # Basic adapter type testing (EGA/VGA/MDA/CGA)
+#ifdef CONFIG_VIDEO_SELECT
+ movw %fs:(0x01fa), %ax # User selected video mode
+ cmpw $ASK_VGA, %ax # Bring up the menu
+ jz vid2
+
+ call mode_set # Set the mode
+ jc vid1
+
+ leaw badmdt, %si # Invalid mode ID
+ call prtstr
+vid2: call mode_menu
+vid1:
+#ifdef CONFIG_VIDEO_RETAIN
+ call restore_screen # Restore screen contents
+#endif /* CONFIG_VIDEO_RETAIN */
+ call store_edid
+#endif /* CONFIG_VIDEO_SELECT */
+ call mode_params # Store mode parameters
+ popw %ds # Restore original DS
+ ret
+
+# Detect if we have CGA, MDA, EGA or VGA and pass it to the kernel.
+basic_detect:
+ movb $0, %fs:(PARAM_HAVE_VGA)
+ movb $0x12, %ah # Check EGA/VGA
+ movb $0x10, %bl
+ int $0x10
+ movw %bx, %fs:(PARAM_VIDEO_EGA_BX) # Identifies EGA to the kernel
+ cmpb $0x10, %bl # No, it's a CGA/MDA/HGA card.
+ je basret
+
+ incb adapter
+ movw $0x1a00, %ax # Check EGA or VGA?
+ int $0x10
+ cmpb $0x1a, %al # 1a means VGA...
+ jne basret # anything else is EGA.
+
+ incb %fs:(PARAM_HAVE_VGA) # We've detected a VGA
+ incb adapter
+basret: ret
+
+# Store the video mode parameters for later usage by the kernel.
+# This is done by asking the BIOS except for the rows/columns
+# parameters in the default 80x25 mode -- these are set directly,
+# because some very obscure BIOSes supply insane values.
+mode_params:
+#ifdef CONFIG_VIDEO_SELECT
+ cmpb $0, graphic_mode
+ jnz mopar_gr
+#endif
+ movb $0x03, %ah # Read cursor position
+ xorb %bh, %bh
+ int $0x10
+ movw %dx, %fs:(PARAM_CURSOR_POS)
+ movb $0x0f, %ah # Read page/mode/width
+ int $0x10
+ movw %bx, %fs:(PARAM_VIDEO_PAGE)
+ movw %ax, %fs:(PARAM_VIDEO_MODE) # Video mode and screen width
+ cmpb $0x7, %al # MDA/HGA => segment differs
+ jnz mopar0
+
+ movw $0xb000, video_segment
+mopar0: movw %gs:(0x485), %ax # Font size
+ movw %ax, %fs:(PARAM_FONT_POINTS) # (valid only on EGA/VGA)
+ movw force_size, %ax # Forced size?
+ orw %ax, %ax
+ jz mopar1
+
+ movb %ah, %fs:(PARAM_VIDEO_COLS)
+ movb %al, %fs:(PARAM_VIDEO_LINES)
+ ret
+
+mopar1: movb $25, %al
+ cmpb $0, adapter # If we are on CGA/MDA/HGA, the
+ jz mopar2 # screen must have 25 lines.
+
+ movb %gs:(0x484), %al # On EGA/VGA, use the EGA+ BIOS
+ incb %al # location of max lines.
+mopar2: movb %al, %fs:(PARAM_VIDEO_LINES)
+ ret
+
+#ifdef CONFIG_VIDEO_SELECT
+# Fetching of VESA frame buffer parameters
+mopar_gr:
+ leaw modelist+1024, %di
+ movb $0x23, %fs:(PARAM_HAVE_VGA)
+ movw 16(%di), %ax
+ movw %ax, %fs:(PARAM_LFB_LINELENGTH)
+ movw 18(%di), %ax
+ movw %ax, %fs:(PARAM_LFB_WIDTH)
+ movw 20(%di), %ax
+ movw %ax, %fs:(PARAM_LFB_HEIGHT)
+ movb 25(%di), %al
+ movb $0, %ah
+ movw %ax, %fs:(PARAM_LFB_DEPTH)
+ movb 29(%di), %al
+ movb $0, %ah
+ movw %ax, %fs:(PARAM_LFB_PAGES)
+ movl 40(%di), %eax
+ movl %eax, %fs:(PARAM_LFB_BASE)
+ movl 31(%di), %eax
+ movl %eax, %fs:(PARAM_LFB_COLORS)
+ movl 35(%di), %eax
+ movl %eax, %fs:(PARAM_LFB_COLORS+4)
+ movw 0(%di), %ax
+ movw %ax, %fs:(PARAM_VESA_ATTRIB)
+
+# get video mem size
+ leaw modelist+1024, %di
+ movw $0x4f00, %ax
+ int $0x10
+ xorl %eax, %eax
+ movw 18(%di), %ax
+ movl %eax, %fs:(PARAM_LFB_SIZE)
+
+# switching the DAC to 8-bit is for <= 8 bpp only
+ movw %fs:(PARAM_LFB_DEPTH), %ax
+ cmpw $8, %ax
+ jg dac_done
+
+# get DAC switching capability
+ xorl %eax, %eax
+ movb 10(%di), %al
+ testb $1, %al
+ jz dac_set
+
+# attempt to switch DAC to 8-bit
+ movw $0x4f08, %ax
+ movw $0x0800, %bx
+ int $0x10
+ cmpw $0x004f, %ax
+ jne dac_set
+ movb %bh, dac_size # store actual DAC size
+
+dac_set:
+# set color size to DAC size
+ movb dac_size, %al
+ movb %al, %fs:(PARAM_LFB_COLORS+0)
+ movb %al, %fs:(PARAM_LFB_COLORS+2)
+ movb %al, %fs:(PARAM_LFB_COLORS+4)
+ movb %al, %fs:(PARAM_LFB_COLORS+6)
+
+# set color offsets to 0
+ movb $0, %fs:(PARAM_LFB_COLORS+1)
+ movb $0, %fs:(PARAM_LFB_COLORS+3)
+ movb $0, %fs:(PARAM_LFB_COLORS+5)
+ movb $0, %fs:(PARAM_LFB_COLORS+7)
+
+dac_done:
+# get protected mode interface informations
+ movw $0x4f0a, %ax
+ xorw %bx, %bx
+ xorw %di, %di
+ int $0x10
+ cmp $0x004f, %ax
+ jnz no_pm
+
+ movw %es, %fs:(PARAM_VESAPM_SEG)
+ movw %di, %fs:(PARAM_VESAPM_OFF)
+no_pm: ret
+
+# The video mode menu
+mode_menu:
+ leaw keymsg, %si # "Return/Space/Timeout" message
+ call prtstr
+ call flush
+nokey: call getkt
+
+ cmpb $0x0d, %al # ENTER ?
+ je listm # yes - manual mode selection
+
+ cmpb $0x20, %al # SPACE ?
+ je defmd1 # no - repeat
+
+ call beep
+ jmp nokey
+
+defmd1: ret # No mode chosen? Default 80x25
+
+listm: call mode_table # List mode table
+listm0: leaw name_bann, %si # Print adapter name
+ call prtstr
+ movw card_name, %si
+ orw %si, %si
+ jnz an2
+
+ movb adapter, %al
+ leaw old_name, %si
+ orb %al, %al
+ jz an1
+
+ leaw ega_name, %si
+ decb %al
+ jz an1
+
+ leaw vga_name, %si
+ jmp an1
+
+an2: call prtstr
+ leaw svga_name, %si
+an1: call prtstr
+ leaw listhdr, %si # Table header
+ call prtstr
+ movb $0x30, %dl # DL holds mode number
+ leaw modelist, %si
+lm1: cmpw $ASK_VGA, (%si) # End?
+ jz lm2
+
+ movb %dl, %al # Menu selection number
+ call prtchr
+ call prtsp2
+ lodsw
+ call prthw # Mode ID
+ call prtsp2
+ movb 0x1(%si), %al
+ call prtdec # Rows
+ movb $0x78, %al # the letter 'x'
+ call prtchr
+ lodsw
+ call prtdec # Columns
+ movb $0x0d, %al # New line
+ call prtchr
+ movb $0x0a, %al
+ call prtchr
+ incb %dl # Next character
+ cmpb $0x3a, %dl
+ jnz lm1
+
+ movb $0x61, %dl
+ jmp lm1
+
+lm2: leaw prompt, %si # Mode prompt
+ call prtstr
+ leaw edit_buf, %di # Editor buffer
+lm3: call getkey
+ cmpb $0x0d, %al # Enter?
+ jz lment
+
+ cmpb $0x08, %al # Backspace?
+ jz lmbs
+
+ cmpb $0x20, %al # Printable?
+ jc lm3
+
+ cmpw $edit_buf+4, %di # Enough space?
+ jz lm3
+
+ stosb
+ call prtchr
+ jmp lm3
+
+lmbs: cmpw $edit_buf, %di # Backspace
+ jz lm3
+
+ decw %di
+ movb $0x08, %al
+ call prtchr
+ call prtspc
+ movb $0x08, %al
+ call prtchr
+ jmp lm3
+
+lment: movb $0, (%di)
+ leaw crlft, %si
+ call prtstr
+ leaw edit_buf, %si
+ cmpb $0, (%si) # Empty string = default mode
+ jz lmdef
+
+ cmpb $0, 1(%si) # One character = menu selection
+ jz mnusel
+
+ cmpw $0x6373, (%si) # "scan" => mode scanning
+ jnz lmhx
+
+ cmpw $0x6e61, 2(%si)
+ jz lmscan
+
+lmhx: xorw %bx, %bx # Else => mode ID in hex
+lmhex: lodsb
+ orb %al, %al
+ jz lmuse1
+
+ subb $0x30, %al
+ jc lmbad
+
+ cmpb $10, %al
+ jc lmhx1
+
+ subb $7, %al
+ andb $0xdf, %al
+ cmpb $10, %al
+ jc lmbad
+
+ cmpb $16, %al
+ jnc lmbad
+
+lmhx1: shlw $4, %bx
+ orb %al, %bl
+ jmp lmhex
+
+lmuse1: movw %bx, %ax
+ jmp lmuse
+
+mnusel: lodsb # Menu selection
+ xorb %ah, %ah
+ subb $0x30, %al
+ jc lmbad
+
+ cmpb $10, %al
+ jc lmuse
+
+ cmpb $0x61-0x30, %al
+ jc lmbad
+
+ subb $0x61-0x30-10, %al
+ cmpb $36, %al
+ jnc lmbad
+
+lmuse: call mode_set
+ jc lmdef
+
+lmbad: leaw unknt, %si
+ call prtstr
+ jmp lm2
+lmscan: cmpb $0, adapter # Scanning only on EGA/VGA
+ jz lmbad
+
+ movw $0, mt_end # Scanning of modes is
+ movb $1, scanning # done as new autodetection.
+ call mode_table
+ jmp listm0
+lmdef: ret
+
+# Additional parts of mode_set... (relative jumps, you know)
+setv7: # Video7 extended modes
+ DO_STORE
+ subb $VIDEO_FIRST_V7>>8, %bh
+ movw $0x6f05, %ax
+ int $0x10
+ stc
+ ret
+
+_setrec: jmp setrec # Ugly...
+_set_80x25: jmp set_80x25
+
+# Aliases for backward compatibility.
+setalias:
+ movw $VIDEO_80x25, %ax
+ incw %bx
+ jz mode_set
+
+ movb $VIDEO_8POINT-VIDEO_FIRST_SPECIAL, %al
+ incw %bx
+ jnz setbad # Fall-through!
+
+# Setting of user mode (AX=mode ID) => CF=success
+mode_set:
+ movw %ax, %fs:(0x01fa) # Store mode for use in acpi_wakeup.S
+ movw %ax, %bx
+ cmpb $0xff, %ah
+ jz setalias
+
+ testb $VIDEO_RECALC>>8, %ah
+ jnz _setrec
+
+ cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah
+ jnc setres
+
+ cmpb $VIDEO_FIRST_SPECIAL>>8, %ah
+ jz setspc
+
+ cmpb $VIDEO_FIRST_V7>>8, %ah
+ jz setv7
+
+ cmpb $VIDEO_FIRST_VESA>>8, %ah
+ jnc check_vesa
+
+ orb %ah, %ah
+ jz setmenu
+
+ decb %ah
+ jz setbios
+
+setbad: clc
+ movb $0, do_restore # The screen needn't be restored
+ ret
+
+setvesa:
+ DO_STORE
+ subb $VIDEO_FIRST_VESA>>8, %bh
+ movw $0x4f02, %ax # VESA BIOS mode set call
+ int $0x10
+ cmpw $0x004f, %ax # AL=4f if implemented
+ jnz setbad # AH=0 if OK
+
+ stc
+ ret
+
+setbios:
+ DO_STORE
+ int $0x10 # Standard BIOS mode set call
+ pushw %bx
+ movb $0x0f, %ah # Check if really set
+ int $0x10
+ popw %bx
+ cmpb %bl, %al
+ jnz setbad
+
+ stc
+ ret
+
+setspc: xorb %bh, %bh # Set special mode
+ cmpb $VIDEO_LAST_SPECIAL-VIDEO_FIRST_SPECIAL, %bl
+ jnc setbad
+
+ addw %bx, %bx
+ jmp *spec_inits(%bx)
+
+setmenu:
+ orb %al, %al # 80x25 is an exception
+ jz _set_80x25
+
+ pushw %bx # Set mode chosen from menu
+ call mode_table # Build the mode table
+ popw %ax
+ shlw $2, %ax
+ addw %ax, %si
+ cmpw %di, %si
+ jnc setbad
+
+ movw (%si), %ax # Fetch mode ID
+_m_s: jmp mode_set
+
+setres: pushw %bx # Set mode chosen by resolution
+ call mode_table
+ popw %bx
+ xchgb %bl, %bh
+setr1: lodsw
+ cmpw $ASK_VGA, %ax # End of the list?
+ jz setbad
+
+ lodsw
+ cmpw %bx, %ax
+ jnz setr1
+
+ movw -4(%si), %ax # Fetch mode ID
+ jmp _m_s
+
+check_vesa:
+ leaw modelist+1024, %di
+ subb $VIDEO_FIRST_VESA>>8, %bh
+ movw %bx, %cx # Get mode information structure
+ movw $0x4f01, %ax
+ int $0x10
+ addb $VIDEO_FIRST_VESA>>8, %bh
+ cmpw $0x004f, %ax
+ jnz setbad
+
+ movb (%di), %al # Check capabilities.
+ andb $0x19, %al
+ cmpb $0x09, %al
+ jz setvesa # This is a text mode
+
+ movb (%di), %al # Check capabilities.
+ andb $0x99, %al
+ cmpb $0x99, %al
+ jnz _setbad # Doh! No linear frame buffer.
+
+ subb $VIDEO_FIRST_VESA>>8, %bh
+ orw $0x4000, %bx # Use linear frame buffer
+ movw $0x4f02, %ax # VESA BIOS mode set call
+ int $0x10
+ cmpw $0x004f, %ax # AL=4f if implemented
+ jnz _setbad # AH=0 if OK
+
+ movb $1, graphic_mode # flag graphic mode
+ movb $0, do_restore # no screen restore
+ stc
+ ret
+
+_setbad: jmp setbad # Ugly...
+
+# Recalculate vertical display end registers -- this fixes various
+# inconsistencies of extended modes on many adapters. Called when
+# the VIDEO_RECALC flag is set in the mode ID.
+
+setrec: subb $VIDEO_RECALC>>8, %ah # Set the base mode
+ call mode_set
+ jnc rct3
+
+ movw %gs:(0x485), %ax # Font size in pixels
+ movb %gs:(0x484), %bl # Number of rows
+ incb %bl
+ mulb %bl # Number of visible
+ decw %ax # scan lines - 1
+ movw $0x3d4, %dx
+ movw %ax, %bx
+ movb $0x12, %al # Lower 8 bits
+ movb %bl, %ah
+ outw %ax, %dx
+ movb $0x07, %al # Bits 8 and 9 in the overflow register
+ call inidx
+ xchgb %al, %ah
+ andb $0xbd, %ah
+ shrb %bh
+ jnc rct1
+ orb $0x02, %ah
+rct1: shrb %bh
+ jnc rct2
+ orb $0x40, %ah
+rct2: movb $0x07, %al
+ outw %ax, %dx
+ stc
+rct3: ret
+
+# Table of routines for setting of the special modes.
+spec_inits:
+ .word set_80x25
+ .word set_8pixel
+ .word set_80x43
+ .word set_80x28
+ .word set_current
+ .word set_80x30
+ .word set_80x34
+ .word set_80x60
+ .word set_gfx
+
+# Set the 80x25 mode. If already set, do nothing.
+set_80x25:
+ movw $0x5019, force_size # Override possibly broken BIOS
+use_80x25:
+#ifdef CONFIG_VIDEO_400_HACK
+ movw $0x1202, %ax # Force 400 scan lines
+ movb $0x30, %bl
+ int $0x10
+#else
+ movb $0x0f, %ah # Get current mode ID
+ int $0x10
+ cmpw $0x5007, %ax # Mode 7 (80x25 mono) is the only one available
+ jz st80 # on CGA/MDA/HGA and is also available on EGAM
+
+ cmpw $0x5003, %ax # Unknown mode, force 80x25 color
+ jnz force3
+
+st80: cmpb $0, adapter # CGA/MDA/HGA => mode 3/7 is always 80x25
+ jz set80
+
+ movb %gs:(0x0484), %al # This is EGA+ -- beware of 80x50 etc.
+ orb %al, %al # Some buggy BIOS'es set 0 rows
+ jz set80
+
+ cmpb $24, %al # It's hopefully correct
+ jz set80
+#endif /* CONFIG_VIDEO_400_HACK */
+force3: DO_STORE
+ movw $0x0003, %ax # Forced set
+ int $0x10
+set80: stc
+ ret
+
+# Set the 80x50/80x43 8-pixel mode. Simple BIOS calls.
+set_8pixel:
+ DO_STORE
+ call use_80x25 # The base is 80x25
+set_8pt:
+ movw $0x1112, %ax # Use 8x8 font
+ xorb %bl, %bl
+ int $0x10
+ movw $0x1200, %ax # Use alternate print screen
+ movb $0x20, %bl
+ int $0x10
+ movw $0x1201, %ax # Turn off cursor emulation
+ movb $0x34, %bl
+ int $0x10
+ movb $0x01, %ah # Define cursor scan lines 6-7
+ movw $0x0607, %cx
+ int $0x10
+set_current:
+ stc
+ ret
+
+# Set the 80x28 mode. This mode works on all VGA's, because it's a standard
+# 80x25 mode with 14-point fonts instead of 16-point.
+set_80x28:
+ DO_STORE
+ call use_80x25 # The base is 80x25
+set14: movw $0x1111, %ax # Use 9x14 font
+ xorb %bl, %bl
+ int $0x10
+ movb $0x01, %ah # Define cursor scan lines 11-12
+ movw $0x0b0c, %cx
+ int $0x10
+ stc
+ ret
+
+# Set the 80x43 mode. This mode is works on all VGA's.
+# It's a 350-scanline mode with 8-pixel font.
+set_80x43:
+ DO_STORE
+ movw $0x1201, %ax # Set 350 scans
+ movb $0x30, %bl
+ int $0x10
+ movw $0x0003, %ax # Reset video mode
+ int $0x10
+ jmp set_8pt # Use 8-pixel font
+
+# Set the 80x30 mode (all VGA's). 480 scanlines, 16-pixel font.
+set_80x30:
+ call use_80x25 # Start with real 80x25
+ DO_STORE
+ movw $0x3cc, %dx # Get CRTC port
+ inb %dx, %al
+ movb $0xd4, %dl
+ rorb %al # Mono or color?
+ jc set48a
+
+ movb $0xb4, %dl
+set48a: movw $0x0c11, %ax # Vertical sync end (also unlocks CR0-7)
+ call outidx
+ movw $0x0b06, %ax # Vertical total
+ call outidx
+ movw $0x3e07, %ax # (Vertical) overflow
+ call outidx
+ movw $0xea10, %ax # Vertical sync start
+ call outidx
+ movw $0xdf12, %ax # Vertical display end
+ call outidx
+ movw $0xe715, %ax # Vertical blank start
+ call outidx
+ movw $0x0416, %ax # Vertical blank end
+ call outidx
+ pushw %dx
+ movb $0xcc, %dl # Misc output register (read)
+ inb %dx, %al
+ movb $0xc2, %dl # (write)
+ andb $0x0d, %al # Preserve clock select bits and color bit
+ orb $0xe2, %al # Set correct sync polarity
+ outb %al, %dx
+ popw %dx
+ movw $0x501e, force_size
+ stc # That's all.
+ ret
+
+# Set the 80x34 mode (all VGA's). 480 scans, 14-pixel font.
+set_80x34:
+ call set_80x30 # Set 480 scans
+ call set14 # And 14-pt font
+ movw $0xdb12, %ax # VGA vertical display end
+ movw $0x5022, force_size
+setvde: call outidx
+ stc
+ ret
+
+# Set the 80x60 mode (all VGA's). 480 scans, 8-pixel font.
+set_80x60:
+ call set_80x30 # Set 480 scans
+ call set_8pt # And 8-pt font
+ movw $0xdf12, %ax # VGA vertical display end
+ movw $0x503c, force_size
+ jmp setvde
+
+# Special hack for ThinkPad graphics
+set_gfx:
+#ifdef CONFIG_VIDEO_GFX_HACK
+ movw $VIDEO_GFX_BIOS_AX, %ax
+ movw $VIDEO_GFX_BIOS_BX, %bx
+ int $0x10
+ movw $VIDEO_GFX_DUMMY_RESOLUTION, force_size
+ stc
+#endif
+ ret
+
+#ifdef CONFIG_VIDEO_RETAIN
+
+# Store screen contents to temporary buffer.
+store_screen:
+ cmpb $0, do_restore # Already stored?
+ jnz stsr
+
+ testb $CAN_USE_HEAP, loadflags # Have we space for storing?
+ jz stsr
+
+ pushw %ax
+ pushw %bx
+ pushw force_size # Don't force specific size
+ movw $0, force_size
+ call mode_params # Obtain params of current mode
+ popw force_size
+ movb %fs:(PARAM_VIDEO_LINES), %ah
+ movb %fs:(PARAM_VIDEO_COLS), %al
+ movw %ax, %bx # BX=dimensions
+ mulb %ah
+ movw %ax, %cx # CX=number of characters
+ addw %ax, %ax # Calculate image size
+ addw $modelist+1024+4, %ax
+ cmpw heap_end_ptr, %ax
+ jnc sts1 # Unfortunately, out of memory
+
+ movw %fs:(PARAM_CURSOR_POS), %ax # Store mode params
+ leaw modelist+1024, %di
+ stosw
+ movw %bx, %ax
+ stosw
+ pushw %ds # Store the screen
+ movw video_segment, %ds
+ xorw %si, %si
+ rep
+ movsw
+ popw %ds
+ incb do_restore # Screen will be restored later
+sts1: popw %bx
+ popw %ax
+stsr: ret
+
+# Restore screen contents from temporary buffer.
+restore_screen:
+ cmpb $0, do_restore # Has the screen been stored?
+ jz res1
+
+ call mode_params # Get parameters of current mode
+ movb %fs:(PARAM_VIDEO_LINES), %cl
+ movb %fs:(PARAM_VIDEO_COLS), %ch
+ leaw modelist+1024, %si # Screen buffer
+ lodsw # Set cursor position
+ movw %ax, %dx
+ cmpb %cl, %dh
+ jc res2
+
+ movb %cl, %dh
+ decb %dh
+res2: cmpb %ch, %dl
+ jc res3
+
+ movb %ch, %dl
+ decb %dl
+res3: movb $0x02, %ah
+ movb $0x00, %bh
+ int $0x10
+ lodsw # Display size
+ movb %ah, %dl # DL=number of lines
+ movb $0, %ah # BX=phys. length of orig. line
+ movw %ax, %bx
+ cmpb %cl, %dl # Too many?
+ jc res4
+
+ pushw %ax
+ movb %dl, %al
+ subb %cl, %al
+ mulb %bl
+ addw %ax, %si
+ addw %ax, %si
+ popw %ax
+ movb %cl, %dl
+res4: cmpb %ch, %al # Too wide?
+ jc res5
+
+ movb %ch, %al # AX=width of src. line
+res5: movb $0, %cl
+ xchgb %ch, %cl
+ movw %cx, %bp # BP=width of dest. line
+ pushw %es
+ movw video_segment, %es
+ xorw %di, %di # Move the data
+ addw %bx, %bx # Convert BX and BP to _bytes_
+ addw %bp, %bp
+res6: pushw %si
+ pushw %di
+ movw %ax, %cx
+ rep
+ movsw
+ popw %di
+ popw %si
+ addw %bp, %di
+ addw %bx, %si
+ decb %dl
+ jnz res6
+
+ popw %es # Done
+res1: ret
+#endif /* CONFIG_VIDEO_RETAIN */
+
+# Write to indexed VGA register (AL=index, AH=data, DX=index reg. port)
+outidx: outb %al, %dx
+ pushw %ax
+ movb %ah, %al
+ incw %dx
+ outb %al, %dx
+ decw %dx
+ popw %ax
+ ret
+
+# Build the table of video modes (stored after the setup.S code at the
+# `modelist' label. Each video mode record looks like:
+# .word MODE-ID (our special mode ID (see above))
+# .byte rows (number of rows)
+# .byte columns (number of columns)
+# Returns address of the end of the table in DI, the end is marked
+# with a ASK_VGA ID.
+mode_table:
+ movw mt_end, %di # Already filled?
+ orw %di, %di
+ jnz mtab1x
+
+ leaw modelist, %di # Store standard modes:
+ movl $VIDEO_80x25 + 0x50190000, %eax # The 80x25 mode (ALL)
+ stosl
+ movb adapter, %al # CGA/MDA/HGA -- no more modes
+ orb %al, %al
+ jz mtabe
+
+ decb %al
+ jnz mtabv
+
+ movl $VIDEO_8POINT + 0x502b0000, %eax # The 80x43 EGA mode
+ stosl
+ jmp mtabe
+
+mtab1x: jmp mtab1
+
+mtabv: leaw vga_modes, %si # All modes for std VGA
+ movw $vga_modes_end-vga_modes, %cx
+ rep # I'm unable to use movsw as I don't know how to store a half
+ movsb # of the expression above to cx without using explicit shr.
+
+ cmpb $0, scanning # Mode scan requested?
+ jz mscan1
+
+ call mode_scan
+mscan1:
+
+#ifdef CONFIG_VIDEO_LOCAL
+ call local_modes
+#endif /* CONFIG_VIDEO_LOCAL */
+
+#ifdef CONFIG_VIDEO_VESA
+ call vesa_modes # Detect VESA VGA modes
+#endif /* CONFIG_VIDEO_VESA */
+
+#ifdef CONFIG_VIDEO_SVGA
+ cmpb $0, scanning # Bypass when scanning
+ jnz mscan2
+
+ call svga_modes # Detect SVGA cards & modes
+mscan2:
+#endif /* CONFIG_VIDEO_SVGA */
+
+mtabe:
+
+#ifdef CONFIG_VIDEO_COMPACT
+ leaw modelist, %si
+ movw %di, %dx
+ movw %si, %di
+cmt1: cmpw %dx, %si # Scan all modes
+ jz cmt2
+
+ leaw modelist, %bx # Find in previous entries
+ movw 2(%si), %cx
+cmt3: cmpw %bx, %si
+ jz cmt4
+
+ cmpw 2(%bx), %cx # Found => don't copy this entry
+ jz cmt5
+
+ addw $4, %bx
+ jmp cmt3
+
+cmt4: movsl # Copy entry
+ jmp cmt1
+
+cmt5: addw $4, %si # Skip entry
+ jmp cmt1
+
+cmt2:
+#endif /* CONFIG_VIDEO_COMPACT */
+
+ movw $ASK_VGA, (%di) # End marker
+ movw %di, mt_end
+mtab1: leaw modelist, %si # SI=mode list, DI=list end
+ret0: ret
+
+# Modes usable on all standard VGAs
+vga_modes:
+ .word VIDEO_8POINT
+ .word 0x5032 # 80x50
+ .word VIDEO_80x43
+ .word 0x502b # 80x43
+ .word VIDEO_80x28
+ .word 0x501c # 80x28
+ .word VIDEO_80x30
+ .word 0x501e # 80x30
+ .word VIDEO_80x34
+ .word 0x5022 # 80x34
+ .word VIDEO_80x60
+ .word 0x503c # 80x60
+#ifdef CONFIG_VIDEO_GFX_HACK
+ .word VIDEO_GFX_HACK
+ .word VIDEO_GFX_DUMMY_RESOLUTION
+#endif
+
+vga_modes_end:
+# Detect VESA modes.
+
+#ifdef CONFIG_VIDEO_VESA
+vesa_modes:
+ cmpb $2, adapter # VGA only
+ jnz ret0
+
+ movw %di, %bp # BP=original mode table end
+ addw $0x200, %di # Buffer space
+ movw $0x4f00, %ax # VESA Get card info call
+ int $0x10
+ movw %bp, %di
+ cmpw $0x004f, %ax # Successful?
+ jnz ret0
+
+ cmpw $0x4556, 0x200(%di)
+ jnz ret0
+
+ cmpw $0x4153, 0x202(%di)
+ jnz ret0
+
+ movw $vesa_name, card_name # Set name to "VESA VGA"
+ pushw %gs
+ lgsw 0x20e(%di), %si # GS:SI=mode list
+ movw $128, %cx # Iteration limit
+vesa1:
+# gas version 2.9.1, using BFD version 2.9.1.0.23 buggers the next inst.
+# XXX: lodsw %gs:(%si), %ax # Get next mode in the list
+ gs; lodsw
+ cmpw $0xffff, %ax # End of the table?
+ jz vesar
+
+ cmpw $0x0080, %ax # Check validity of mode ID
+ jc vesa2
+
+ orb %ah, %ah # Valid IDs: 0x0000-0x007f/0x0100-0x07ff
+ jz vesan # Certain BIOSes report 0x80-0xff!
+
+ cmpw $0x0800, %ax
+ jnc vesae
+
+vesa2: pushw %cx
+ movw %ax, %cx # Get mode information structure
+ movw $0x4f01, %ax
+ int $0x10
+ movw %cx, %bx # BX=mode number
+ addb $VIDEO_FIRST_VESA>>8, %bh
+ popw %cx
+ cmpw $0x004f, %ax
+ jnz vesan # Don't report errors (buggy BIOSES)
+
+ movb (%di), %al # Check capabilities. We require
+ andb $0x19, %al # a color text mode.
+ cmpb $0x09, %al
+ jnz vesan
+
+ cmpw $0xb800, 8(%di) # Standard video memory address required
+ jnz vesan
+
+ testb $2, (%di) # Mode characteristics supplied?
+ movw %bx, (%di) # Store mode number
+ jz vesa3
+
+ xorw %dx, %dx
+ movw 0x12(%di), %bx # Width
+ orb %bh, %bh
+ jnz vesan
+
+ movb %bl, 0x3(%di)
+ movw 0x14(%di), %ax # Height
+ orb %ah, %ah
+ jnz vesan
+
+ movb %al, 2(%di)
+ mulb %bl
+ cmpw $8193, %ax # Small enough for Linux console driver?
+ jnc vesan
+
+ jmp vesaok
+
+vesa3: subw $0x8108, %bx # This mode has no detailed info specified,
+ jc vesan # so it must be a standard VESA mode.
+
+ cmpw $5, %bx
+ jnc vesan
+
+ movw vesa_text_mode_table(%bx), %ax
+ movw %ax, 2(%di)
+vesaok: addw $4, %di # The mode is valid. Store it.
+vesan: loop vesa1 # Next mode. Limit exceeded => error
+vesae: leaw vesaer, %si
+ call prtstr
+ movw %bp, %di # Discard already found modes.
+vesar: popw %gs
+ ret
+
+# Dimensions of standard VESA text modes
+vesa_text_mode_table:
+ .byte 60, 80 # 0108
+ .byte 25, 132 # 0109
+ .byte 43, 132 # 010A
+ .byte 50, 132 # 010B
+ .byte 60, 132 # 010C
+#endif /* CONFIG_VIDEO_VESA */
+
+# Scan for video modes. A bit dirty, but should work.
+mode_scan:
+ movw $0x0100, %cx # Start with mode 0
+scm1: movb $0, %ah # Test the mode
+ movb %cl, %al
+ int $0x10
+ movb $0x0f, %ah
+ int $0x10
+ cmpb %cl, %al
+ jnz scm2 # Mode not set
+
+ movw $0x3c0, %dx # Test if it's a text mode
+ movb $0x10, %al # Mode bits
+ call inidx
+ andb $0x03, %al
+ jnz scm2
+
+ movb $0xce, %dl # Another set of mode bits
+ movb $0x06, %al
+ call inidx
+ shrb %al
+ jc scm2
+
+ movb $0xd4, %dl # Cursor location
+ movb $0x0f, %al
+ call inidx
+ orb %al, %al
+ jnz scm2
+
+ movw %cx, %ax # Ok, store the mode
+ stosw
+ movb %gs:(0x484), %al # Number of rows
+ incb %al
+ stosb
+ movw %gs:(0x44a), %ax # Number of columns
+ stosb
+scm2: incb %cl
+ jns scm1
+
+ movw $0x0003, %ax # Return back to mode 3
+ int $0x10
+ ret
+
+tstidx: outw %ax, %dx # OUT DX,AX and inidx
+inidx: outb %al, %dx # Read from indexed VGA register
+ incw %dx # AL=index, DX=index reg port -> AL=data
+ inb %dx, %al
+ decw %dx
+ ret
+
+# Try to detect type of SVGA card and supply (usually approximate) video
+# mode table for it.
+
+#ifdef CONFIG_VIDEO_SVGA
+svga_modes:
+ leaw svga_table, %si # Test all known SVGA adapters
+dosvga: lodsw
+ movw %ax, %bp # Default mode table
+ orw %ax, %ax
+ jz didsv1
+
+ lodsw # Pointer to test routine
+ pushw %si
+ pushw %di
+ pushw %es
+ movw $0xc000, %bx
+ movw %bx, %es
+ call *%ax # Call test routine
+ popw %es
+ popw %di
+ popw %si
+ orw %bp, %bp
+ jz dosvga
+
+ movw %bp, %si # Found, copy the modes
+ movb svga_prefix, %ah
+cpsvga: lodsb
+ orb %al, %al
+ jz didsv
+
+ stosw
+ movsw
+ jmp cpsvga
+
+didsv: movw %si, card_name # Store pointer to card name
+didsv1: ret
+
+# Table of all known SVGA cards. For each card, we store a pointer to
+# a table of video modes supported by the card and a pointer to a routine
+# used for testing of presence of the card. The video mode table is always
+# followed by the name of the card or the chipset.
+svga_table:
+ .word ati_md, ati_test
+ .word oak_md, oak_test
+ .word paradise_md, paradise_test
+ .word realtek_md, realtek_test
+ .word s3_md, s3_test
+ .word chips_md, chips_test
+ .word video7_md, video7_test
+ .word cirrus5_md, cirrus5_test
+ .word cirrus6_md, cirrus6_test
+ .word cirrus1_md, cirrus1_test
+ .word ahead_md, ahead_test
+ .word everex_md, everex_test
+ .word genoa_md, genoa_test
+ .word trident_md, trident_test
+ .word tseng_md, tseng_test
+ .word 0
+
+# Test routines and mode tables:
+
+# S3 - The test algorithm was taken from the SuperProbe package
+# for XFree86 1.2.1. Report bugs to Christoph.Niemann@linux.org
+s3_test:
+ movw $0x0f35, %cx # we store some constants in cl/ch
+ movw $0x03d4, %dx
+ movb $0x38, %al
+ call inidx
+ movb %al, %bh # store current CRT-register 0x38
+ movw $0x0038, %ax
+ call outidx # disable writing to special regs
+ movb %cl, %al # check whether we can write special reg 0x35
+ call inidx
+ movb %al, %bl # save the current value of CRT reg 0x35
+ andb $0xf0, %al # clear bits 0-3
+ movb %al, %ah
+ movb %cl, %al # and write it to CRT reg 0x35
+ call outidx
+ call inidx # now read it back
+ andb %ch, %al # clear the upper 4 bits
+ jz s3_2 # the first test failed. But we have a
+
+ movb %bl, %ah # second chance
+ movb %cl, %al
+ call outidx
+ jmp s3_1 # do the other tests
+
+s3_2: movw %cx, %ax # load ah with 0xf and al with 0x35
+ orb %bl, %ah # set the upper 4 bits of ah with the orig value
+ call outidx # write ...
+ call inidx # ... and reread
+ andb %cl, %al # turn off the upper 4 bits
+ pushw %ax
+ movb %bl, %ah # restore old value in register 0x35
+ movb %cl, %al
+ call outidx
+ popw %ax
+ cmpb %ch, %al # setting lower 4 bits was successful => bad
+ je no_s3 # writing is allowed => this is not an S3
+
+s3_1: movw $0x4838, %ax # allow writing to special regs by putting
+ call outidx # magic number into CRT-register 0x38
+ movb %cl, %al # check whether we can write special reg 0x35
+ call inidx
+ movb %al, %bl
+ andb $0xf0, %al
+ movb %al, %ah
+ movb %cl, %al
+ call outidx
+ call inidx
+ andb %ch, %al
+ jnz no_s3 # no, we can't write => no S3
+
+ movw %cx, %ax
+ orb %bl, %ah
+ call outidx
+ call inidx
+ andb %ch, %al
+ pushw %ax
+ movb %bl, %ah # restore old value in register 0x35
+ movb %cl, %al
+ call outidx
+ popw %ax
+ cmpb %ch, %al
+ jne no_s31 # writing not possible => no S3
+ movb $0x30, %al
+ call inidx # now get the S3 id ...
+ leaw idS3, %di
+ movw $0x10, %cx
+ repne
+ scasb
+ je no_s31
+
+ movb %bh, %ah
+ movb $0x38, %al
+ jmp s3rest
+
+no_s3: movb $0x35, %al # restore CRT register 0x35
+ movb %bl, %ah
+ call outidx
+no_s31: xorw %bp, %bp # Detection failed
+s3rest: movb %bh, %ah
+ movb $0x38, %al # restore old value of CRT register 0x38
+ jmp outidx
+
+idS3: .byte 0x81, 0x82, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95
+ .byte 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa8, 0xb0
+
+s3_md: .byte 0x54, 0x2b, 0x84
+ .byte 0x55, 0x19, 0x84
+ .byte 0
+ .ascii "S3"
+ .byte 0
+
+# ATI cards.
+ati_test:
+ leaw idati, %si
+ movw $0x31, %di
+ movw $0x09, %cx
+ repe
+ cmpsb
+ je atiok
+
+ xorw %bp, %bp
+atiok: ret
+
+idati: .ascii "761295520"
+
+ati_md: .byte 0x23, 0x19, 0x84
+ .byte 0x33, 0x2c, 0x84
+ .byte 0x22, 0x1e, 0x64
+ .byte 0x21, 0x19, 0x64
+ .byte 0x58, 0x21, 0x50
+ .byte 0x5b, 0x1e, 0x50
+ .byte 0
+ .ascii "ATI"
+ .byte 0
+
+# AHEAD
+ahead_test:
+ movw $0x200f, %ax
+ movw $0x3ce, %dx
+ outw %ax, %dx
+ incw %dx
+ inb %dx, %al
+ cmpb $0x20, %al
+ je isahed
+
+ cmpb $0x21, %al
+ je isahed
+
+ xorw %bp, %bp
+isahed: ret
+
+ahead_md:
+ .byte 0x22, 0x2c, 0x84
+ .byte 0x23, 0x19, 0x84
+ .byte 0x24, 0x1c, 0x84
+ .byte 0x2f, 0x32, 0xa0
+ .byte 0x32, 0x22, 0x50
+ .byte 0x34, 0x42, 0x50
+ .byte 0
+ .ascii "Ahead"
+ .byte 0
+
+# Chips & Tech.
+chips_test:
+ movw $0x3c3, %dx
+ inb %dx, %al
+ orb $0x10, %al
+ outb %al, %dx
+ movw $0x104, %dx
+ inb %dx, %al
+ movb %al, %bl
+ movw $0x3c3, %dx
+ inb %dx, %al
+ andb $0xef, %al
+ outb %al, %dx
+ cmpb $0xa5, %bl
+ je cantok
+
+ xorw %bp, %bp
+cantok: ret
+
+chips_md:
+ .byte 0x60, 0x19, 0x84
+ .byte 0x61, 0x32, 0x84
+ .byte 0
+ .ascii "Chips & Technologies"
+ .byte 0
+
+# Cirrus Logic 5X0
+cirrus1_test:
+ movw $0x3d4, %dx
+ movb $0x0c, %al
+ outb %al, %dx
+ incw %dx
+ inb %dx, %al
+ movb %al, %bl
+ xorb %al, %al
+ outb %al, %dx
+ decw %dx
+ movb $0x1f, %al
+ outb %al, %dx
+ incw %dx
+ inb %dx, %al
+ movb %al, %bh
+ xorb %ah, %ah
+ shlb $4, %al
+ movw %ax, %cx
+ movb %bh, %al
+ shrb $4, %al
+ addw %ax, %cx
+ shlw $8, %cx
+ addw $6, %cx
+ movw %cx, %ax
+ movw $0x3c4, %dx
+ outw %ax, %dx
+ incw %dx
+ inb %dx, %al
+ andb %al, %al
+ jnz nocirr
+
+ movb %bh, %al
+ outb %al, %dx
+ inb %dx, %al
+ cmpb $0x01, %al
+ je iscirr
+
+nocirr: xorw %bp, %bp
+iscirr: movw $0x3d4, %dx
+ movb %bl, %al
+ xorb %ah, %ah
+ shlw $8, %ax
+ addw $0x0c, %ax
+ outw %ax, %dx
+ ret
+
+cirrus1_md:
+ .byte 0x1f, 0x19, 0x84
+ .byte 0x20, 0x2c, 0x84
+ .byte 0x22, 0x1e, 0x84
+ .byte 0x31, 0x25, 0x64
+ .byte 0
+ .ascii "Cirrus Logic 5X0"
+ .byte 0
+
+# Cirrus Logic 54XX
+cirrus5_test:
+ movw $0x3c4, %dx
+ movb $6, %al
+ call inidx
+ movb %al, %bl # BL=backup
+ movw $6, %ax
+ call tstidx
+ cmpb $0x0f, %al
+ jne c5fail
+
+ movw $0x1206, %ax
+ call tstidx
+ cmpb $0x12, %al
+ jne c5fail
+
+ movb $0x1e, %al
+ call inidx
+ movb %al, %bh
+ movb %bh, %ah
+ andb $0xc0, %ah
+ movb $0x1e, %al
+ call tstidx
+ andb $0x3f, %al
+ jne c5xx
+
+ movb $0x1e, %al
+ movb %bh, %ah
+ orb $0x3f, %ah
+ call tstidx
+ xorb $0x3f, %al
+ andb $0x3f, %al
+c5xx: pushf
+ movb $0x1e, %al
+ movb %bh, %ah
+ outw %ax, %dx
+ popf
+ je c5done
+
+c5fail: xorw %bp, %bp
+c5done: movb $6, %al
+ movb %bl, %ah
+ outw %ax, %dx
+ ret
+
+cirrus5_md:
+ .byte 0x14, 0x19, 0x84
+ .byte 0x54, 0x2b, 0x84
+ .byte 0
+ .ascii "Cirrus Logic 54XX"
+ .byte 0
+
+# Cirrus Logic 64XX -- no known extra modes, but must be identified, because
+# it's misidentified by the Ahead test.
+cirrus6_test:
+ movw $0x3ce, %dx
+ movb $0x0a, %al
+ call inidx
+ movb %al, %bl # BL=backup
+ movw $0xce0a, %ax
+ call tstidx
+ orb %al, %al
+ jne c2fail
+
+ movw $0xec0a, %ax
+ call tstidx
+ cmpb $0x01, %al
+ jne c2fail
+
+ movb $0xaa, %al
+ call inidx # 4X, 5X, 7X and 8X are valid 64XX chip ID's.
+ shrb $4, %al
+ subb $4, %al
+ jz c6done
+
+ decb %al
+ jz c6done
+
+ subb $2, %al
+ jz c6done
+
+ decb %al
+ jz c6done
+
+c2fail: xorw %bp, %bp
+c6done: movb $0x0a, %al
+ movb %bl, %ah
+ outw %ax, %dx
+ ret
+
+cirrus6_md:
+ .byte 0
+ .ascii "Cirrus Logic 64XX"
+ .byte 0
+
+# Everex / Trident
+everex_test:
+ movw $0x7000, %ax
+ xorw %bx, %bx
+ int $0x10
+ cmpb $0x70, %al
+ jne noevrx
+
+ shrw $4, %dx
+ cmpw $0x678, %dx
+ je evtrid
+
+ cmpw $0x236, %dx
+ jne evrxok
+
+evtrid: leaw trident_md, %bp
+evrxok: ret
+
+noevrx: xorw %bp, %bp
+ ret
+
+everex_md:
+ .byte 0x03, 0x22, 0x50
+ .byte 0x04, 0x3c, 0x50
+ .byte 0x07, 0x2b, 0x64
+ .byte 0x08, 0x4b, 0x64
+ .byte 0x0a, 0x19, 0x84
+ .byte 0x0b, 0x2c, 0x84
+ .byte 0x16, 0x1e, 0x50
+ .byte 0x18, 0x1b, 0x64
+ .byte 0x21, 0x40, 0xa0
+ .byte 0x40, 0x1e, 0x84
+ .byte 0
+ .ascii "Everex/Trident"
+ .byte 0
+
+# Genoa.
+genoa_test:
+ leaw idgenoa, %si # Check Genoa 'clues'
+ xorw %ax, %ax
+ movb %es:(0x37), %al
+ movw %ax, %di
+ movw $0x04, %cx
+ decw %si
+ decw %di
+l1: incw %si
+ incw %di
+ movb (%si), %al
+ testb %al, %al
+ jz l2
+
+ cmpb %es:(%di), %al
+l2: loope l1
+ orw %cx, %cx
+ je isgen
+
+ xorw %bp, %bp
+isgen: ret
+
+idgenoa: .byte 0x77, 0x00, 0x99, 0x66
+
+genoa_md:
+ .byte 0x58, 0x20, 0x50
+ .byte 0x5a, 0x2a, 0x64
+ .byte 0x60, 0x19, 0x84
+ .byte 0x61, 0x1d, 0x84
+ .byte 0x62, 0x20, 0x84
+ .byte 0x63, 0x2c, 0x84
+ .byte 0x64, 0x3c, 0x84
+ .byte 0x6b, 0x4f, 0x64
+ .byte 0x72, 0x3c, 0x50
+ .byte 0x74, 0x42, 0x50
+ .byte 0x78, 0x4b, 0x64
+ .byte 0
+ .ascii "Genoa"
+ .byte 0
+
+# OAK
+oak_test:
+ leaw idoakvga, %si
+ movw $0x08, %di
+ movw $0x08, %cx
+ repe
+ cmpsb
+ je isoak
+
+ xorw %bp, %bp
+isoak: ret
+
+idoakvga: .ascii "OAK VGA "
+
+oak_md: .byte 0x4e, 0x3c, 0x50
+ .byte 0x4f, 0x3c, 0x84
+ .byte 0x50, 0x19, 0x84
+ .byte 0x51, 0x2b, 0x84
+ .byte 0
+ .ascii "OAK"
+ .byte 0
+
+# WD Paradise.
+paradise_test:
+ leaw idparadise, %si
+ movw $0x7d, %di
+ movw $0x04, %cx
+ repe
+ cmpsb
+ je ispara
+
+ xorw %bp, %bp
+ispara: ret
+
+idparadise: .ascii "VGA="
+
+paradise_md:
+ .byte 0x41, 0x22, 0x50
+ .byte 0x47, 0x1c, 0x84
+ .byte 0x55, 0x19, 0x84
+ .byte 0x54, 0x2c, 0x84
+ .byte 0
+ .ascii "Paradise"
+ .byte 0
+
+# Trident.
+trident_test:
+ movw $0x3c4, %dx
+ movb $0x0e, %al
+ outb %al, %dx
+ incw %dx
+ inb %dx, %al
+ xchgb %al, %ah
+ xorb %al, %al
+ outb %al, %dx
+ inb %dx, %al
+ xchgb %ah, %al
+ movb %al, %bl # Strange thing ... in the book this wasn't
+ andb $0x02, %bl # necessary but it worked on my card which
+ jz setb2 # is a trident. Without it the screen goes
+ # blurred ...
+ andb $0xfd, %al
+ jmp clrb2
+
+setb2: orb $0x02, %al
+clrb2: outb %al, %dx
+ andb $0x0f, %ah
+ cmpb $0x02, %ah
+ je istrid
+
+ xorw %bp, %bp
+istrid: ret
+
+trident_md:
+ .byte 0x50, 0x1e, 0x50
+ .byte 0x51, 0x2b, 0x50
+ .byte 0x52, 0x3c, 0x50
+ .byte 0x57, 0x19, 0x84
+ .byte 0x58, 0x1e, 0x84
+ .byte 0x59, 0x2b, 0x84
+ .byte 0x5a, 0x3c, 0x84
+ .byte 0
+ .ascii "Trident"
+ .byte 0
+
+# Tseng.
+tseng_test:
+ movw $0x3cd, %dx
+ inb %dx, %al # Could things be this simple ! :-)
+ movb %al, %bl
+ movb $0x55, %al
+ outb %al, %dx
+ inb %dx, %al
+ movb %al, %ah
+ movb %bl, %al
+ outb %al, %dx
+ cmpb $0x55, %ah
+ je istsen
+
+isnot: xorw %bp, %bp
+istsen: ret
+
+tseng_md:
+ .byte 0x26, 0x3c, 0x50
+ .byte 0x2a, 0x28, 0x64
+ .byte 0x23, 0x19, 0x84
+ .byte 0x24, 0x1c, 0x84
+ .byte 0x22, 0x2c, 0x84
+ .byte 0x21, 0x3c, 0x84
+ .byte 0
+ .ascii "Tseng"
+ .byte 0
+
+# Video7.
+video7_test:
+ movw $0x3cc, %dx
+ inb %dx, %al
+ movw $0x3b4, %dx
+ andb $0x01, %al
+ jz even7
+
+ movw $0x3d4, %dx
+even7: movb $0x0c, %al
+ outb %al, %dx
+ incw %dx
+ inb %dx, %al
+ movb %al, %bl
+ movb $0x55, %al
+ outb %al, %dx
+ inb %dx, %al
+ decw %dx
+ movb $0x1f, %al
+ outb %al, %dx
+ incw %dx
+ inb %dx, %al
+ movb %al, %bh
+ decw %dx
+ movb $0x0c, %al
+ outb %al, %dx
+ incw %dx
+ movb %bl, %al
+ outb %al, %dx
+ movb $0x55, %al
+ xorb $0xea, %al
+ cmpb %bh, %al
+ jne isnot
+
+ movb $VIDEO_FIRST_V7>>8, svga_prefix # Use special mode switching
+ ret
+
+video7_md:
+ .byte 0x40, 0x2b, 0x50
+ .byte 0x43, 0x3c, 0x50
+ .byte 0x44, 0x3c, 0x64
+ .byte 0x41, 0x19, 0x84
+ .byte 0x42, 0x2c, 0x84
+ .byte 0x45, 0x1c, 0x84
+ .byte 0
+ .ascii "Video 7"
+ .byte 0
+
+# Realtek VGA
+realtek_test:
+ leaw idrtvga, %si
+ movw $0x45, %di
+ movw $0x0b, %cx
+ repe
+ cmpsb
+ je isrt
+
+ xorw %bp, %bp
+isrt: ret
+
+idrtvga: .ascii "REALTEK VGA"
+
+realtek_md:
+ .byte 0x1a, 0x3c, 0x50
+ .byte 0x1b, 0x19, 0x84
+ .byte 0x1c, 0x1e, 0x84
+ .byte 0x1d, 0x2b, 0x84
+ .byte 0x1e, 0x3c, 0x84
+ .byte 0
+ .ascii "REALTEK"
+ .byte 0
+
+#endif /* CONFIG_VIDEO_SVGA */
+
+# User-defined local mode table (VGA only)
+#ifdef CONFIG_VIDEO_LOCAL
+local_modes:
+ leaw local_mode_table, %si
+locm1: lodsw
+ orw %ax, %ax
+ jz locm2
+
+ stosw
+ movsw
+ jmp locm1
+
+locm2: ret
+
+# This is the table of local video modes which can be supplied manually
+# by the user. Each entry consists of mode ID (word) and dimensions
+# (byte for column count and another byte for row count). These modes
+# are placed before all SVGA and VESA modes and override them if table
+# compacting is enabled. The table must end with a zero word followed
+# by NUL-terminated video adapter name.
+local_mode_table:
+ .word 0x0100 # Example: 40x25
+ .byte 25,40
+ .word 0
+ .ascii "Local"
+ .byte 0
+#endif /* CONFIG_VIDEO_LOCAL */
+
+# Read a key and return the ASCII code in al, scan code in ah
+getkey: xorb %ah, %ah
+ int $0x16
+ ret
+
+# Read a key with a timeout of 30 seconds.
+# The hardware clock is used to get the time.
+getkt: call gettime
+ addb $30, %al # Wait 30 seconds
+ cmpb $60, %al
+ jl lminute
+
+ subb $60, %al
+lminute:
+ movb %al, %cl
+again: movb $0x01, %ah
+ int $0x16
+ jnz getkey # key pressed, so get it
+
+ call gettime
+ cmpb %cl, %al
+ jne again
+
+ movb $0x20, %al # timeout, return `space'
+ ret
+
+# Flush the keyboard buffer
+flush: movb $0x01, %ah
+ int $0x16
+ jz empty
+
+ xorb %ah, %ah
+ int $0x16
+ jmp flush
+
+empty: ret
+
+# Print hexadecimal number.
+prthw: pushw %ax
+ movb %ah, %al
+ call prthb
+ popw %ax
+prthb: pushw %ax
+ shrb $4, %al
+ call prthn
+ popw %ax
+ andb $0x0f, %al
+prthn: cmpb $0x0a, %al
+ jc prth1
+
+ addb $0x07, %al
+prth1: addb $0x30, %al
+ jmp prtchr
+
+# Print decimal number in al
+prtdec: pushw %ax
+ pushw %cx
+ xorb %ah, %ah
+ movb $0x0a, %cl
+ idivb %cl
+ cmpb $0x09, %al
+ jbe lt100
+
+ call prtdec
+ jmp skip10
+
+lt100: addb $0x30, %al
+ call prtchr
+skip10: movb %ah, %al
+ addb $0x30, %al
+ call prtchr
+ popw %cx
+ popw %ax
+ ret
+
+store_edid:
+ pushw %es # just save all registers
+ pushw %ax
+ pushw %bx
+ pushw %cx
+ pushw %dx
+ pushw %di
+
+ pushw %fs
+ popw %es
+
+ movl $0x13131313, %eax # memset block with 0x13
+ movw $32, %cx
+ movw $0x140, %di
+ cld
+ rep
+ stosl
+
+ movw $0x4f15, %ax # do VBE/DDC
+ movw $0x01, %bx
+ movw $0x00, %cx
+ movw $0x01, %dx
+ movw $0x140, %di
+ int $0x10
+
+ popw %di # restore all registers
+ popw %dx
+ popw %cx
+ popw %bx
+ popw %ax
+ popw %es
+ ret
+
+# VIDEO_SELECT-only variables
+mt_end: .word 0 # End of video mode table if built
+edit_buf: .space 6 # Line editor buffer
+card_name: .word 0 # Pointer to adapter name
+scanning: .byte 0 # Performing mode scan
+do_restore: .byte 0 # Screen contents altered during mode change
+svga_prefix: .byte VIDEO_FIRST_BIOS>>8 # Default prefix for BIOS modes
+graphic_mode: .byte 0 # Graphic mode with a linear frame buffer
+dac_size: .byte 6 # DAC bit depth
+
+# Status messages
+keymsg: .ascii "Press <RETURN> to see video modes available, "
+ .ascii "<SPACE> to continue or wait 30 secs"
+ .byte 0x0d, 0x0a, 0
+
+listhdr: .byte 0x0d, 0x0a
+ .ascii "Mode: COLSxROWS:"
+
+crlft: .byte 0x0d, 0x0a, 0
+
+prompt: .byte 0x0d, 0x0a
+ .asciz "Enter mode number or `scan': "
+
+unknt: .asciz "Unknown mode ID. Try again."
+
+badmdt: .ascii "You passed an undefined mode number."
+ .byte 0x0d, 0x0a, 0
+
+vesaer: .ascii "Error: Scanning of VESA modes failed. Please "
+ .ascii "report to <mj@ucw.cz>."
+ .byte 0x0d, 0x0a, 0
+
+old_name: .asciz "CGA/MDA/HGA"
+
+ega_name: .asciz "EGA"
+
+svga_name: .ascii " "
+
+vga_name: .asciz "VGA"
+
+vesa_name: .asciz "VESA"
+
+name_bann: .asciz "Video adapter: "
+#endif /* CONFIG_VIDEO_SELECT */
+
+# Other variables:
+adapter: .byte 0 # Video adapter: 0=CGA/MDA/HGA,1=EGA,2=VGA
+video_segment: .word 0xb800 # Video memory segment
+force_size: .word 0 # Use this size instead of the one in BIOS vars
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
new file mode 100644
index 000000000000..9ce51dee30b3
--- /dev/null
+++ b/arch/x86_64/defconfig
@@ -0,0 +1,1129 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.11-bk7
+# Sat Mar 12 23:43:44 2005
+#
+CONFIG_X86_64=y
+CONFIG_64BIT=y
+CONFIG_X86=y
+CONFIG_MMU=y
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+CONFIG_GENERIC_CALIBRATE_DELAY=y
+CONFIG_X86_CMPXCHG=y
+CONFIG_EARLY_PRINTK=y
+CONFIG_HPET_TIMER=y
+CONFIG_HPET_EMULATE_RTC=y
+CONFIG_GENERIC_ISA_DMA=y
+CONFIG_GENERIC_IOMAP=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_CLEAN_COMPILE=y
+CONFIG_LOCK_KERNEL=y
+
+#
+# General setup
+#
+CONFIG_LOCALVERSION=""
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+# CONFIG_BSD_PROCESS_ACCT is not set
+CONFIG_SYSCTL=y
+# CONFIG_AUDIT is not set
+CONFIG_LOG_BUF_SHIFT=18
+# CONFIG_HOTPLUG is not set
+CONFIG_KOBJECT_UEVENT=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+# CONFIG_CPUSETS is not set
+# CONFIG_EMBEDDED is not set
+CONFIG_KALLSYMS=y
+CONFIG_KALLSYMS_ALL=y
+# CONFIG_KALLSYMS_EXTRA_PASS is not set
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
+CONFIG_SHMEM=y
+CONFIG_CC_ALIGN_FUNCTIONS=0
+CONFIG_CC_ALIGN_LABELS=0
+CONFIG_CC_ALIGN_LOOPS=0
+CONFIG_CC_ALIGN_JUMPS=0
+# CONFIG_TINY_SHMEM is not set
+CONFIG_BASE_SMALL=0
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODULE_FORCE_UNLOAD=y
+CONFIG_OBSOLETE_MODPARM=y
+# CONFIG_MODVERSIONS is not set
+# CONFIG_MODULE_SRCVERSION_ALL is not set
+# CONFIG_KMOD is not set
+CONFIG_STOP_MACHINE=y
+
+#
+# Processor type and features
+#
+# CONFIG_MK8 is not set
+# CONFIG_MPSC is not set
+CONFIG_GENERIC_CPU=y
+CONFIG_X86_L1_CACHE_BYTES=128
+CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_TSC=y
+CONFIG_X86_GOOD_APIC=y
+# CONFIG_MICROCODE is not set
+CONFIG_X86_MSR=y
+CONFIG_X86_CPUID=y
+CONFIG_X86_HT=y
+CONFIG_X86_IO_APIC=y
+CONFIG_X86_LOCAL_APIC=y
+CONFIG_MTRR=y
+CONFIG_SMP=y
+# CONFIG_PREEMPT is not set
+CONFIG_SCHED_SMT=y
+CONFIG_K8_NUMA=y
+# CONFIG_NUMA_EMU is not set
+CONFIG_DISCONTIGMEM=y
+CONFIG_NUMA=y
+CONFIG_HAVE_DEC_LOCK=y
+CONFIG_NR_CPUS=8
+CONFIG_GART_IOMMU=y
+CONFIG_SWIOTLB=y
+CONFIG_X86_MCE=y
+CONFIG_X86_MCE_INTEL=y
+CONFIG_SECCOMP=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_IRQ_PROBE=y
+
+#
+# Power management options
+#
+CONFIG_PM=y
+# CONFIG_PM_DEBUG is not set
+CONFIG_SOFTWARE_SUSPEND=y
+CONFIG_PM_STD_PARTITION=""
+
+#
+# ACPI (Advanced Configuration and Power Interface) Support
+#
+CONFIG_ACPI=y
+CONFIG_ACPI_BOOT=y
+CONFIG_ACPI_INTERPRETER=y
+CONFIG_ACPI_SLEEP=y
+CONFIG_ACPI_SLEEP_PROC_FS=y
+CONFIG_ACPI_AC=y
+CONFIG_ACPI_BATTERY=y
+CONFIG_ACPI_BUTTON=y
+# CONFIG_ACPI_VIDEO is not set
+CONFIG_ACPI_FAN=y
+CONFIG_ACPI_PROCESSOR=y
+CONFIG_ACPI_THERMAL=y
+CONFIG_ACPI_NUMA=y
+# CONFIG_ACPI_ASUS is not set
+# CONFIG_ACPI_IBM is not set
+CONFIG_ACPI_TOSHIBA=y
+CONFIG_ACPI_BLACKLIST_YEAR=2001
+CONFIG_ACPI_DEBUG=y
+CONFIG_ACPI_BUS=y
+CONFIG_ACPI_EC=y
+CONFIG_ACPI_POWER=y
+CONFIG_ACPI_PCI=y
+CONFIG_ACPI_SYSTEM=y
+# CONFIG_ACPI_CONTAINER is not set
+
+#
+# CPU Frequency scaling
+#
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_DEBUG is not set
+CONFIG_CPU_FREQ_STAT=y
+# CONFIG_CPU_FREQ_STAT_DETAILS is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_TABLE=y
+
+#
+# CPUFreq processor drivers
+#
+CONFIG_X86_POWERNOW_K8=y
+CONFIG_X86_POWERNOW_K8_ACPI=y
+# CONFIG_X86_SPEEDSTEP_CENTRINO is not set
+CONFIG_X86_ACPI_CPUFREQ=y
+
+#
+# shared options
+#
+CONFIG_X86_ACPI_CPUFREQ_PROC_INTF=y
+
+#
+# Bus options (PCI etc.)
+#
+CONFIG_PCI=y
+CONFIG_PCI_DIRECT=y
+CONFIG_PCI_MMCONFIG=y
+CONFIG_UNORDERED_IO=y
+CONFIG_PCI_MSI=y
+# CONFIG_PCI_LEGACY_PROC is not set
+# CONFIG_PCI_NAMES is not set
+
+#
+# PCCARD (PCMCIA/CardBus) support
+#
+# CONFIG_PCCARD is not set
+
+#
+# PC-card bridges
+#
+
+#
+# PCI Hotplug Support
+#
+# CONFIG_HOTPLUG_PCI is not set
+
+#
+# Executable file formats / Emulations
+#
+CONFIG_BINFMT_ELF=y
+# CONFIG_BINFMT_MISC is not set
+CONFIG_IA32_EMULATION=y
+CONFIG_IA32_AOUT=y
+CONFIG_COMPAT=y
+CONFIG_SYSVIPC_COMPAT=y
+CONFIG_UID16=y
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+# CONFIG_FW_LOADER is not set
+# CONFIG_DEBUG_DRIVER is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+# CONFIG_PARPORT is not set
+
+#
+# Plug and Play support
+#
+# CONFIG_PNP is not set
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=y
+# CONFIG_BLK_CPQ_DA is not set
+# CONFIG_BLK_CPQ_CISS_DA is not set
+# CONFIG_BLK_DEV_DAC960 is not set
+# CONFIG_BLK_DEV_UMEM is not set
+# CONFIG_BLK_DEV_COW_COMMON is not set
+CONFIG_BLK_DEV_LOOP=y
+# CONFIG_BLK_DEV_CRYPTOLOOP is not set
+# CONFIG_BLK_DEV_NBD is not set
+# CONFIG_BLK_DEV_SX8 is not set
+# CONFIG_BLK_DEV_UB is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_LBD=y
+# CONFIG_CDROM_PKTCDVD is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_ATA_OVER_ETH is not set
+
+#
+# ATA/ATAPI/MFM/RLL support
+#
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+
+#
+# Please see Documentation/ide.txt for help/info on IDE drives
+#
+# CONFIG_BLK_DEV_IDE_SATA is not set
+# CONFIG_BLK_DEV_HD_IDE is not set
+CONFIG_BLK_DEV_IDEDISK=y
+CONFIG_IDEDISK_MULTI_MODE=y
+CONFIG_BLK_DEV_IDECD=y
+# CONFIG_BLK_DEV_IDETAPE is not set
+# CONFIG_BLK_DEV_IDEFLOPPY is not set
+# CONFIG_BLK_DEV_IDESCSI is not set
+# CONFIG_IDE_TASK_IOCTL is not set
+
+#
+# IDE chipset support/bugfixes
+#
+CONFIG_IDE_GENERIC=y
+# CONFIG_BLK_DEV_CMD640 is not set
+CONFIG_BLK_DEV_IDEPCI=y
+# CONFIG_IDEPCI_SHARE_IRQ is not set
+# CONFIG_BLK_DEV_OFFBOARD is not set
+# CONFIG_BLK_DEV_GENERIC is not set
+# CONFIG_BLK_DEV_OPTI621 is not set
+# CONFIG_BLK_DEV_RZ1000 is not set
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+# CONFIG_BLK_DEV_AEC62XX is not set
+# CONFIG_BLK_DEV_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_BLK_DEV_ATIIXP is not set
+# CONFIG_BLK_DEV_CMD64X is not set
+# CONFIG_BLK_DEV_TRIFLEX is not set
+# CONFIG_BLK_DEV_CY82C693 is not set
+# CONFIG_BLK_DEV_CS5520 is not set
+# CONFIG_BLK_DEV_CS5530 is not set
+# CONFIG_BLK_DEV_HPT34X is not set
+# CONFIG_BLK_DEV_HPT366 is not set
+# CONFIG_BLK_DEV_SC1200 is not set
+CONFIG_BLK_DEV_PIIX=y
+# CONFIG_BLK_DEV_NS87415 is not set
+# CONFIG_BLK_DEV_PDC202XX_OLD is not set
+# CONFIG_BLK_DEV_PDC202XX_NEW is not set
+# CONFIG_BLK_DEV_SVWKS is not set
+# CONFIG_BLK_DEV_SIIMAGE is not set
+# CONFIG_BLK_DEV_SIS5513 is not set
+# CONFIG_BLK_DEV_SLC90E66 is not set
+# CONFIG_BLK_DEV_TRM290 is not set
+# CONFIG_BLK_DEV_VIA82CXXX is not set
+# CONFIG_IDE_ARM is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_IVB is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_BLK_DEV_HD is not set
+
+#
+# SCSI device support
+#
+CONFIG_SCSI=y
+# CONFIG_SCSI_PROC_FS is not set
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=y
+# CONFIG_CHR_DEV_ST is not set
+# CONFIG_CHR_DEV_OSST is not set
+# CONFIG_BLK_DEV_SR is not set
+# CONFIG_CHR_DEV_SG is not set
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+# CONFIG_SCSI_MULTI_LUN is not set
+# CONFIG_SCSI_CONSTANTS is not set
+# CONFIG_SCSI_LOGGING is not set
+
+#
+# SCSI Transport Attributes
+#
+# CONFIG_SCSI_SPI_ATTRS is not set
+# CONFIG_SCSI_FC_ATTRS is not set
+# CONFIG_SCSI_ISCSI_ATTRS is not set
+
+#
+# SCSI low-level drivers
+#
+CONFIG_BLK_DEV_3W_XXXX_RAID=y
+# CONFIG_SCSI_3W_9XXX is not set
+# CONFIG_SCSI_ACARD is not set
+# CONFIG_SCSI_AACRAID is not set
+# CONFIG_SCSI_AIC7XXX is not set
+# CONFIG_SCSI_AIC7XXX_OLD is not set
+CONFIG_SCSI_AIC79XX=y
+CONFIG_AIC79XX_CMDS_PER_DEVICE=32
+CONFIG_AIC79XX_RESET_DELAY_MS=4000
+# CONFIG_AIC79XX_ENABLE_RD_STRM is not set
+# CONFIG_AIC79XX_DEBUG_ENABLE is not set
+CONFIG_AIC79XX_DEBUG_MASK=0
+# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
+# CONFIG_MEGARAID_NEWGEN is not set
+# CONFIG_MEGARAID_LEGACY is not set
+CONFIG_SCSI_SATA=y
+# CONFIG_SCSI_SATA_AHCI is not set
+# CONFIG_SCSI_SATA_SVW is not set
+CONFIG_SCSI_ATA_PIIX=y
+# CONFIG_SCSI_SATA_NV is not set
+# CONFIG_SCSI_SATA_PROMISE is not set
+# CONFIG_SCSI_SATA_QSTOR is not set
+# CONFIG_SCSI_SATA_SX4 is not set
+# CONFIG_SCSI_SATA_SIL is not set
+# CONFIG_SCSI_SATA_SIS is not set
+# CONFIG_SCSI_SATA_ULI is not set
+CONFIG_SCSI_SATA_VIA=y
+# CONFIG_SCSI_SATA_VITESSE is not set
+# CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_SCSI_DMX3191D is not set
+# CONFIG_SCSI_EATA is not set
+# CONFIG_SCSI_EATA_PIO is not set
+# CONFIG_SCSI_FUTURE_DOMAIN is not set
+# CONFIG_SCSI_GDTH is not set
+# CONFIG_SCSI_IPS is not set
+# CONFIG_SCSI_INITIO is not set
+# CONFIG_SCSI_INIA100 is not set
+# CONFIG_SCSI_SYM53C8XX_2 is not set
+# CONFIG_SCSI_IPR is not set
+# CONFIG_SCSI_QLOGIC_ISP is not set
+# CONFIG_SCSI_QLOGIC_FC is not set
+# CONFIG_SCSI_QLOGIC_1280 is not set
+CONFIG_SCSI_QLA2XXX=y
+# CONFIG_SCSI_QLA21XX is not set
+# CONFIG_SCSI_QLA22XX is not set
+# CONFIG_SCSI_QLA2300 is not set
+# CONFIG_SCSI_QLA2322 is not set
+# CONFIG_SCSI_QLA6312 is not set
+# CONFIG_SCSI_DC395x is not set
+# CONFIG_SCSI_DC390T is not set
+# CONFIG_SCSI_DEBUG is not set
+
+#
+# Multi-device support (RAID and LVM)
+#
+# CONFIG_MD is not set
+
+#
+# Fusion MPT device support
+#
+CONFIG_FUSION=y
+CONFIG_FUSION_MAX_SGE=40
+# CONFIG_FUSION_CTL is not set
+
+#
+# IEEE 1394 (FireWire) support
+#
+# CONFIG_IEEE1394 is not set
+
+#
+# I2O device support
+#
+# CONFIG_I2O is not set
+
+#
+# Networking support
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+# CONFIG_PACKET_MMAP is not set
+# CONFIG_NETLINK_DEV is not set
+CONFIG_UNIX=y
+# CONFIG_NET_KEY is not set
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+# CONFIG_IP_ADVANCED_ROUTER is not set
+# CONFIG_IP_PNP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE is not set
+# CONFIG_IP_MROUTE is not set
+# CONFIG_ARPD is not set
+# CONFIG_SYN_COOKIES is not set
+# CONFIG_INET_AH is not set
+# CONFIG_INET_ESP is not set
+# CONFIG_INET_IPCOMP is not set
+# CONFIG_INET_TUNNEL is not set
+CONFIG_IP_TCPDIAG=y
+CONFIG_IP_TCPDIAG_IPV6=y
+CONFIG_IPV6=y
+# CONFIG_IPV6_PRIVACY is not set
+# CONFIG_INET6_AH is not set
+# CONFIG_INET6_ESP is not set
+# CONFIG_INET6_IPCOMP is not set
+# CONFIG_INET6_TUNNEL is not set
+# CONFIG_IPV6_TUNNEL is not set
+# CONFIG_NETFILTER is not set
+
+#
+# SCTP Configuration (EXPERIMENTAL)
+#
+# CONFIG_IP_SCTP is not set
+# CONFIG_ATM is not set
+# CONFIG_BRIDGE is not set
+# CONFIG_VLAN_8021Q is not set
+# CONFIG_DECNET is not set
+# CONFIG_LLC2 is not set
+# CONFIG_IPX is not set
+# CONFIG_ATALK is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_NET_DIVERT is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+
+#
+# QoS and/or fair queueing
+#
+# CONFIG_NET_SCHED is not set
+# CONFIG_NET_CLS_ROUTE is not set
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+CONFIG_NETPOLL=y
+# CONFIG_NETPOLL_RX is not set
+# CONFIG_NETPOLL_TRAP is not set
+CONFIG_NET_POLL_CONTROLLER=y
+# CONFIG_HAMRADIO is not set
+# CONFIG_IRDA is not set
+# CONFIG_BT is not set
+CONFIG_NETDEVICES=y
+# CONFIG_DUMMY is not set
+# CONFIG_BONDING is not set
+# CONFIG_EQUALIZER is not set
+# CONFIG_TUN is not set
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=y
+# CONFIG_HAPPYMEAL is not set
+# CONFIG_SUNGEM is not set
+# CONFIG_NET_VENDOR_3COM is not set
+
+#
+# Tulip family network device support
+#
+# CONFIG_NET_TULIP is not set
+# CONFIG_HP100 is not set
+CONFIG_NET_PCI=y
+# CONFIG_PCNET32 is not set
+CONFIG_AMD8111_ETH=y
+# CONFIG_AMD8111E_NAPI is not set
+# CONFIG_ADAPTEC_STARFIRE is not set
+# CONFIG_B44 is not set
+CONFIG_FORCEDETH=y
+# CONFIG_DGRS is not set
+# CONFIG_EEPRO100 is not set
+# CONFIG_E100 is not set
+# CONFIG_FEALNX is not set
+# CONFIG_NATSEMI is not set
+# CONFIG_NE2K_PCI is not set
+CONFIG_8139CP=m
+CONFIG_8139TOO=y
+# CONFIG_8139TOO_PIO is not set
+# CONFIG_8139TOO_TUNE_TWISTER is not set
+# CONFIG_8139TOO_8129 is not set
+# CONFIG_8139_OLD_RX_RESET is not set
+# CONFIG_SIS900 is not set
+# CONFIG_EPIC100 is not set
+# CONFIG_SUNDANCE is not set
+# CONFIG_VIA_RHINE is not set
+
+#
+# Ethernet (1000 Mbit)
+#
+# CONFIG_ACENIC is not set
+# CONFIG_DL2K is not set
+CONFIG_E1000=y
+# CONFIG_E1000_NAPI is not set
+# CONFIG_NS83820 is not set
+# CONFIG_HAMACHI is not set
+# CONFIG_YELLOWFIN is not set
+# CONFIG_R8169 is not set
+# CONFIG_SK98LIN is not set
+# CONFIG_VIA_VELOCITY is not set
+CONFIG_TIGON3=y
+
+#
+# Ethernet (10000 Mbit)
+#
+# CONFIG_IXGB is not set
+CONFIG_S2IO=m
+# CONFIG_S2IO_NAPI is not set
+# CONFIG_2BUFF_MODE is not set
+
+#
+# Token Ring devices
+#
+# CONFIG_TR is not set
+
+#
+# Wireless LAN (non-hamradio)
+#
+# CONFIG_NET_RADIO is not set
+
+#
+# Wan interfaces
+#
+# CONFIG_WAN is not set
+# CONFIG_FDDI is not set
+# CONFIG_HIPPI is not set
+# CONFIG_PPP is not set
+# CONFIG_SLIP is not set
+# CONFIG_NET_FC is not set
+# CONFIG_SHAPER is not set
+CONFIG_NETCONSOLE=y
+
+#
+# ISDN subsystem
+#
+# CONFIG_ISDN is not set
+
+#
+# Telephony Support
+#
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+CONFIG_INPUT_MOUSEDEV_PSAUX=y
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+# CONFIG_INPUT_JOYDEV is not set
+# CONFIG_INPUT_TSDEV is not set
+CONFIG_INPUT_EVDEV=y
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ATKBD=y
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=y
+# CONFIG_MOUSE_SERIAL is not set
+# CONFIG_MOUSE_VSXXXAA is not set
+# CONFIG_INPUT_JOYSTICK is not set
+# CONFIG_INPUT_TOUCHSCREEN is not set
+# CONFIG_INPUT_MISC is not set
+
+#
+# Hardware I/O ports
+#
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+# CONFIG_SERIO_SERPORT is not set
+# CONFIG_SERIO_CT82C710 is not set
+# CONFIG_SERIO_PCIPS2 is not set
+CONFIG_SERIO_LIBPS2=y
+# CONFIG_SERIO_RAW is not set
+# CONFIG_GAMEPORT is not set
+CONFIG_SOUND_GAMEPORT=y
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+# CONFIG_SERIAL_8250_ACPI is not set
+CONFIG_SERIAL_8250_NR_UARTS=4
+# CONFIG_SERIAL_8250_EXTENDED is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_UNIX98_PTYS=y
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=256
+
+#
+# IPMI
+#
+# CONFIG_IPMI_HANDLER is not set
+
+#
+# Watchdog Cards
+#
+# CONFIG_WATCHDOG is not set
+CONFIG_HW_RANDOM=y
+# CONFIG_NVRAM is not set
+CONFIG_RTC=y
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+# CONFIG_APPLICOM is not set
+
+#
+# Ftape, the floppy tape device driver
+#
+CONFIG_AGP=y
+CONFIG_AGP_AMD64=y
+# CONFIG_DRM is not set
+# CONFIG_MWAVE is not set
+CONFIG_RAW_DRIVER=y
+CONFIG_HPET=y
+# CONFIG_HPET_RTC_IRQ is not set
+CONFIG_HPET_MMAP=y
+CONFIG_MAX_RAW_DEVS=256
+CONFIG_HANGCHECK_TIMER=y
+
+#
+# TPM devices
+#
+# CONFIG_TCG_TPM is not set
+
+#
+# I2C support
+#
+# CONFIG_I2C is not set
+
+#
+# Dallas's 1-wire bus
+#
+# CONFIG_W1 is not set
+
+#
+# Misc devices
+#
+# CONFIG_IBM_ASM is not set
+
+#
+# Multimedia devices
+#
+# CONFIG_VIDEO_DEV is not set
+
+#
+# Digital Video Broadcasting Devices
+#
+# CONFIG_DVB is not set
+
+#
+# Graphics support
+#
+# CONFIG_FB is not set
+CONFIG_VIDEO_SELECT=y
+
+#
+# Console display driver support
+#
+CONFIG_VGA_CONSOLE=y
+CONFIG_DUMMY_CONSOLE=y
+
+#
+# Sound
+#
+CONFIG_SOUND=y
+
+#
+# Advanced Linux Sound Architecture
+#
+# CONFIG_SND is not set
+
+#
+# Open Sound System
+#
+CONFIG_SOUND_PRIME=y
+# CONFIG_SOUND_BT878 is not set
+# CONFIG_SOUND_CMPCI is not set
+# CONFIG_SOUND_EMU10K1 is not set
+# CONFIG_SOUND_FUSION is not set
+# CONFIG_SOUND_CS4281 is not set
+# CONFIG_SOUND_ES1370 is not set
+# CONFIG_SOUND_ES1371 is not set
+# CONFIG_SOUND_ESSSOLO1 is not set
+# CONFIG_SOUND_MAESTRO is not set
+# CONFIG_SOUND_MAESTRO3 is not set
+CONFIG_SOUND_ICH=y
+# CONFIG_SOUND_SONICVIBES is not set
+# CONFIG_SOUND_TRIDENT is not set
+# CONFIG_SOUND_MSNDCLAS is not set
+# CONFIG_SOUND_MSNDPIN is not set
+# CONFIG_SOUND_VIA82CXXX is not set
+# CONFIG_SOUND_OSS is not set
+# CONFIG_SOUND_ALI5455 is not set
+# CONFIG_SOUND_FORTE is not set
+# CONFIG_SOUND_RME96XX is not set
+# CONFIG_SOUND_AD1980 is not set
+
+#
+# USB support
+#
+CONFIG_USB=y
+# CONFIG_USB_DEBUG is not set
+
+#
+# Miscellaneous USB options
+#
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+# CONFIG_USB_DYNAMIC_MINORS is not set
+# CONFIG_USB_SUSPEND is not set
+# CONFIG_USB_OTG is not set
+CONFIG_USB_ARCH_HAS_HCD=y
+CONFIG_USB_ARCH_HAS_OHCI=y
+
+#
+# USB Host Controller Drivers
+#
+CONFIG_USB_EHCI_HCD=y
+# CONFIG_USB_EHCI_SPLIT_ISO is not set
+# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
+CONFIG_USB_OHCI_HCD=y
+# CONFIG_USB_OHCI_BIG_ENDIAN is not set
+CONFIG_USB_OHCI_LITTLE_ENDIAN=y
+CONFIG_USB_UHCI_HCD=y
+# CONFIG_USB_SL811_HCD is not set
+
+#
+# USB Device Class drivers
+#
+# CONFIG_USB_AUDIO is not set
+# CONFIG_USB_BLUETOOTH_TTY is not set
+# CONFIG_USB_MIDI is not set
+# CONFIG_USB_ACM is not set
+CONFIG_USB_PRINTER=y
+
+#
+# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information
+#
+CONFIG_USB_STORAGE=y
+# CONFIG_USB_STORAGE_DEBUG is not set
+# CONFIG_USB_STORAGE_RW_DETECT is not set
+# CONFIG_USB_STORAGE_DATAFAB is not set
+# CONFIG_USB_STORAGE_FREECOM is not set
+# CONFIG_USB_STORAGE_ISD200 is not set
+# CONFIG_USB_STORAGE_DPCM is not set
+# CONFIG_USB_STORAGE_USBAT is not set
+# CONFIG_USB_STORAGE_SDDR09 is not set
+# CONFIG_USB_STORAGE_SDDR55 is not set
+# CONFIG_USB_STORAGE_JUMPSHOT is not set
+
+#
+# USB Input Devices
+#
+CONFIG_USB_HID=y
+CONFIG_USB_HIDINPUT=y
+# CONFIG_HID_FF is not set
+# CONFIG_USB_HIDDEV is not set
+# CONFIG_USB_AIPTEK is not set
+# CONFIG_USB_WACOM is not set
+# CONFIG_USB_KBTAB is not set
+# CONFIG_USB_POWERMATE is not set
+# CONFIG_USB_MTOUCH is not set
+# CONFIG_USB_EGALAX is not set
+# CONFIG_USB_XPAD is not set
+# CONFIG_USB_ATI_REMOTE is not set
+
+#
+# USB Imaging devices
+#
+# CONFIG_USB_MDC800 is not set
+# CONFIG_USB_MICROTEK is not set
+
+#
+# USB Multimedia devices
+#
+# CONFIG_USB_DABUSB is not set
+
+#
+# Video4Linux support is needed for USB Multimedia device support
+#
+
+#
+# USB Network Adapters
+#
+# CONFIG_USB_CATC is not set
+# CONFIG_USB_KAWETH is not set
+# CONFIG_USB_PEGASUS is not set
+# CONFIG_USB_RTL8150 is not set
+# CONFIG_USB_USBNET is not set
+CONFIG_USB_MON=y
+
+#
+# USB port drivers
+#
+
+#
+# USB Serial Converter support
+#
+# CONFIG_USB_SERIAL is not set
+
+#
+# USB Miscellaneous drivers
+#
+# CONFIG_USB_EMI62 is not set
+# CONFIG_USB_EMI26 is not set
+# CONFIG_USB_AUERSWALD is not set
+# CONFIG_USB_RIO500 is not set
+# CONFIG_USB_LEGOTOWER is not set
+# CONFIG_USB_LCD is not set
+# CONFIG_USB_LED is not set
+# CONFIG_USB_CYTHERM is not set
+# CONFIG_USB_PHIDGETKIT is not set
+# CONFIG_USB_PHIDGETSERVO is not set
+# CONFIG_USB_IDMOUSE is not set
+# CONFIG_USB_SISUSBVGA is not set
+# CONFIG_USB_TEST is not set
+
+#
+# USB ATM/DSL drivers
+#
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+
+#
+# MMC/SD Card support
+#
+# CONFIG_MMC is not set
+
+#
+# InfiniBand support
+#
+# CONFIG_INFINIBAND is not set
+
+#
+# Firmware Drivers
+#
+# CONFIG_EDD is not set
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+# CONFIG_EXT2_FS_SECURITY is not set
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+# CONFIG_EXT3_FS_SECURITY is not set
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=y
+# CONFIG_REISERFS_CHECK is not set
+# CONFIG_REISERFS_PROC_INFO is not set
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+# CONFIG_REISERFS_FS_SECURITY is not set
+# CONFIG_JFS_FS is not set
+CONFIG_FS_POSIX_ACL=y
+
+#
+# XFS support
+#
+# CONFIG_XFS_FS is not set
+# CONFIG_MINIX_FS is not set
+# CONFIG_ROMFS_FS is not set
+# CONFIG_QUOTA is not set
+CONFIG_DNOTIFY=y
+CONFIG_AUTOFS_FS=y
+# CONFIG_AUTOFS4_FS is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=y
+# CONFIG_JOLIET is not set
+# CONFIG_ZISOFS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_FAT_DEFAULT_CODEPAGE=437
+CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_SYSFS=y
+# CONFIG_DEVFS_FS is not set
+# CONFIG_DEVPTS_FS_XATTR is not set
+CONFIG_TMPFS=y
+# CONFIG_TMPFS_XATTR is not set
+CONFIG_HUGETLBFS=y
+CONFIG_HUGETLB_PAGE=y
+CONFIG_RAMFS=y
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_HFSPLUS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+# CONFIG_CRAMFS is not set
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UFS_FS is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+# CONFIG_NFS_V4 is not set
+# CONFIG_NFS_DIRECTIO is not set
+CONFIG_NFSD=y
+CONFIG_NFSD_V3=y
+# CONFIG_NFSD_V4 is not set
+CONFIG_NFSD_TCP=y
+CONFIG_LOCKD=y
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=y
+CONFIG_SUNRPC=y
+# CONFIG_RPCSEC_GSS_KRB5 is not set
+# CONFIG_RPCSEC_GSS_SPKM3 is not set
+# CONFIG_SMB_FS is not set
+# CONFIG_CIFS is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_CODA_FS is not set
+# CONFIG_AFS_FS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=y
+# CONFIG_NLS_CODEPAGE_737 is not set
+# CONFIG_NLS_CODEPAGE_775 is not set
+# CONFIG_NLS_CODEPAGE_850 is not set
+# CONFIG_NLS_CODEPAGE_852 is not set
+# CONFIG_NLS_CODEPAGE_855 is not set
+# CONFIG_NLS_CODEPAGE_857 is not set
+# CONFIG_NLS_CODEPAGE_860 is not set
+# CONFIG_NLS_CODEPAGE_861 is not set
+# CONFIG_NLS_CODEPAGE_862 is not set
+# CONFIG_NLS_CODEPAGE_863 is not set
+# CONFIG_NLS_CODEPAGE_864 is not set
+# CONFIG_NLS_CODEPAGE_865 is not set
+# CONFIG_NLS_CODEPAGE_866 is not set
+# CONFIG_NLS_CODEPAGE_869 is not set
+# CONFIG_NLS_CODEPAGE_936 is not set
+# CONFIG_NLS_CODEPAGE_950 is not set
+# CONFIG_NLS_CODEPAGE_932 is not set
+# CONFIG_NLS_CODEPAGE_949 is not set
+# CONFIG_NLS_CODEPAGE_874 is not set
+# CONFIG_NLS_ISO8859_8 is not set
+# CONFIG_NLS_CODEPAGE_1250 is not set
+# CONFIG_NLS_CODEPAGE_1251 is not set
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+# CONFIG_NLS_ISO8859_2 is not set
+# CONFIG_NLS_ISO8859_3 is not set
+# CONFIG_NLS_ISO8859_4 is not set
+# CONFIG_NLS_ISO8859_5 is not set
+# CONFIG_NLS_ISO8859_6 is not set
+# CONFIG_NLS_ISO8859_7 is not set
+# CONFIG_NLS_ISO8859_9 is not set
+# CONFIG_NLS_ISO8859_13 is not set
+# CONFIG_NLS_ISO8859_14 is not set
+CONFIG_NLS_ISO8859_15=y
+# CONFIG_NLS_KOI8_R is not set
+# CONFIG_NLS_KOI8_U is not set
+CONFIG_NLS_UTF8=y
+
+#
+# Profiling support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+
+#
+# Kernel hacking
+#
+CONFIG_DEBUG_KERNEL=y
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_PRINTK_TIME is not set
+# CONFIG_SCHEDSTATS is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_DEBUG_KOBJECT is not set
+# CONFIG_DEBUG_INFO is not set
+CONFIG_DEBUG_FS=y
+CONFIG_INIT_DEBUG=y
+# CONFIG_IOMMU_DEBUG is not set
+CONFIG_KPROBES=y
+
+#
+# Security options
+#
+# CONFIG_KEYS is not set
+# CONFIG_SECURITY is not set
+
+#
+# Cryptographic options
+#
+# CONFIG_CRYPTO is not set
+
+#
+# Hardware crypto devices
+#
+
+#
+# Library routines
+#
+# CONFIG_CRC_CCITT is not set
+CONFIG_CRC32=y
+# CONFIG_LIBCRC32C is not set
diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile
new file mode 100644
index 000000000000..a12b19da4b59
--- /dev/null
+++ b/arch/x86_64/ia32/Makefile
@@ -0,0 +1,32 @@
+#
+# Makefile for the ia32 kernel emulation subsystem.
+#
+
+obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o \
+ ia32_signal.o tls32.o \
+ ia32_binfmt.o fpu32.o ptrace32.o syscall32.o
+
+sysv-$(CONFIG_SYSVIPC) := ipc32.o
+obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
+
+obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
+
+$(obj)/syscall32.o: $(src)/syscall32.c \
+ $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so)
+
+# Teach kbuild about targets
+targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so)
+
+# The DSO images are built using a special linker script
+quiet_cmd_syscall = SYSCALL $@
+ cmd_syscall = $(CC) -m32 -nostdlib -shared -s \
+ -Wl,-soname=linux-gate.so.1 -o $@ \
+ -Wl,-T,$(filter-out FORCE,$^)
+
+$(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
+$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
+ $(call if_changed,syscall)
+
+AFLAGS_vsyscall-sysenter.o = -m32
+AFLAGS_vsyscall-syscall.o = -m32
+CFLAGS_ia32_ioctl.o += -Ifs/
diff --git a/arch/x86_64/ia32/fpu32.c b/arch/x86_64/ia32/fpu32.c
new file mode 100644
index 000000000000..1c23095f1813
--- /dev/null
+++ b/arch/x86_64/ia32/fpu32.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes.
+ * This is used for ptrace, signals and coredumps in 32bit emulation.
+ * $Id: fpu32.c,v 1.1 2002/03/21 14:16:32 ak Exp $
+ */
+
+#include <linux/sched.h>
+#include <asm/sigcontext32.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+
+static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
+{
+ unsigned int tmp; /* to avoid 16 bit prefixes in the code */
+
+ /* Transform each pair of bits into 01 (valid) or 00 (empty) */
+ tmp = ~twd;
+ tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
+ /* and move the valid bits to the lower byte. */
+ tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
+ tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
+ tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+ return tmp;
+}
+
+static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
+{
+ struct _fpxreg *st = NULL;
+ unsigned long tos = (fxsave->swd >> 11) & 7;
+ unsigned long twd = (unsigned long) fxsave->twd;
+ unsigned long tag;
+ unsigned long ret = 0xffff0000;
+ int i;
+
+#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16);
+
+ for (i = 0 ; i < 8 ; i++) {
+ if (twd & 0x1) {
+ st = FPREG_ADDR( fxsave, (i - tos) & 7 );
+
+ switch (st->exponent & 0x7fff) {
+ case 0x7fff:
+ tag = 2; /* Special */
+ break;
+ case 0x0000:
+ if ( !st->significand[0] &&
+ !st->significand[1] &&
+ !st->significand[2] &&
+ !st->significand[3] ) {
+ tag = 1; /* Zero */
+ } else {
+ tag = 2; /* Special */
+ }
+ break;
+ default:
+ if (st->significand[3] & 0x8000) {
+ tag = 0; /* Valid */
+ } else {
+ tag = 2; /* Special */
+ }
+ break;
+ }
+ } else {
+ tag = 3; /* Empty */
+ }
+ ret |= (tag << (2 * i));
+ twd = twd >> 1;
+ }
+ return ret;
+}
+
+
+static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave,
+ struct _fpstate_ia32 __user *buf)
+{
+ struct _fpxreg *to;
+ struct _fpreg __user *from;
+ int i;
+ u32 v;
+ int err = 0;
+
+#define G(num,val) err |= __get_user(val, num + (u32 __user *)buf)
+ G(0, fxsave->cwd);
+ G(1, fxsave->swd);
+ G(2, fxsave->twd);
+ fxsave->twd = twd_i387_to_fxsr(fxsave->twd);
+ G(3, fxsave->rip);
+ G(4, v);
+ fxsave->fop = v>>16; /* cs ignored */
+ G(5, fxsave->rdp);
+ /* 6: ds ignored */
+#undef G
+ if (err)
+ return -1;
+
+ to = (struct _fpxreg *)&fxsave->st_space[0];
+ from = &buf->_st[0];
+ for (i = 0 ; i < 8 ; i++, to++, from++) {
+ if (__copy_from_user(to, from, sizeof(*from)))
+ return -1;
+ }
+ return 0;
+}
+
+
+static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf,
+ struct i387_fxsave_struct *fxsave,
+ struct pt_regs *regs,
+ struct task_struct *tsk)
+{
+ struct _fpreg __user *to;
+ struct _fpxreg *from;
+ int i;
+ u16 cs,ds;
+ int err = 0;
+
+ if (tsk == current) {
+ /* should be actually ds/cs at fpu exception time,
+ but that information is not available in 64bit mode. */
+ asm("movw %%ds,%0 " : "=r" (ds));
+ asm("movw %%cs,%0 " : "=r" (cs));
+ } else { /* ptrace. task has stopped. */
+ ds = tsk->thread.ds;
+ cs = regs->cs;
+ }
+
+#define P(num,val) err |= __put_user(val, num + (u32 __user *)buf)
+ P(0, (u32)fxsave->cwd | 0xffff0000);
+ P(1, (u32)fxsave->swd | 0xffff0000);
+ P(2, twd_fxsr_to_i387(fxsave));
+ P(3, (u32)fxsave->rip);
+ P(4, cs | ((u32)fxsave->fop) << 16);
+ P(5, fxsave->rdp);
+ P(6, 0xffff0000 | ds);
+#undef P
+
+ if (err)
+ return -1;
+
+ to = &buf->_st[0];
+ from = (struct _fpxreg *) &fxsave->st_space[0];
+ for ( i = 0 ; i < 8 ; i++, to++, from++ ) {
+ if (__copy_to_user(to, from, sizeof(*to)))
+ return -1;
+ }
+ return 0;
+}
+
+int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave)
+{
+ clear_fpu(tsk);
+ if (!fsave) {
+ if (__copy_from_user(&tsk->thread.i387.fxsave,
+ &buf->_fxsr_env[0],
+ sizeof(struct i387_fxsave_struct)))
+ return -1;
+ tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
+ set_stopped_child_used_math(tsk);
+ }
+ return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf);
+}
+
+int save_i387_ia32(struct task_struct *tsk,
+ struct _fpstate_ia32 __user *buf,
+ struct pt_regs *regs,
+ int fsave)
+{
+ int err = 0;
+
+ init_fpu(tsk);
+ if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk))
+ return -1;
+ if (fsave)
+ return 0;
+ err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
+ if (fsave)
+ return err ? -1 : 1;
+ err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
+ err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
+ sizeof(struct i387_fxsave_struct));
+ return err ? -1 : 1;
+}
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
new file mode 100644
index 000000000000..1965efc974dc
--- /dev/null
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -0,0 +1,529 @@
+/*
+ * a.out loader for x86-64
+ *
+ * Copyright (C) 1991, 1992, 1996 Linus Torvalds
+ * Hacked together by Andi Kleen
+ */
+
+#include <linux/module.h>
+
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/a.out.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/ptrace.h>
+#include <linux/user.h>
+#include <linux/slab.h>
+#include <linux/binfmts.h>
+#include <linux/personality.h>
+#include <linux/init.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/user32.h>
+#include <asm/ia32.h>
+
+#undef WARN_OLD
+#undef CORE_DUMP /* probably broken */
+
+extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
+ unsigned long stack_top, int exec_stack);
+
+static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_aout_library(struct file*);
+
+#if CORE_DUMP
+static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file);
+
+/*
+ * fill in the user structure for a core dump..
+ */
+static void dump_thread32(struct pt_regs * regs, struct user32 * dump)
+{
+ u32 fs,gs;
+
+/* changed the size calculations - should hopefully work better. lbt */
+ dump->magic = CMAGIC;
+ dump->start_code = 0;
+ dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1);
+ dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
+ dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
+ dump->u_dsize -= dump->u_tsize;
+ dump->u_ssize = 0;
+ dump->u_debugreg[0] = current->thread.debugreg0;
+ dump->u_debugreg[1] = current->thread.debugreg1;
+ dump->u_debugreg[2] = current->thread.debugreg2;
+ dump->u_debugreg[3] = current->thread.debugreg3;
+ dump->u_debugreg[4] = 0;
+ dump->u_debugreg[5] = 0;
+ dump->u_debugreg[6] = current->thread.debugreg6;
+ dump->u_debugreg[7] = current->thread.debugreg7;
+
+ if (dump->start_stack < 0xc0000000)
+ dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT;
+
+ dump->regs.ebx = regs->rbx;
+ dump->regs.ecx = regs->rcx;
+ dump->regs.edx = regs->rdx;
+ dump->regs.esi = regs->rsi;
+ dump->regs.edi = regs->rdi;
+ dump->regs.ebp = regs->rbp;
+ dump->regs.eax = regs->rax;
+ dump->regs.ds = current->thread.ds;
+ dump->regs.es = current->thread.es;
+ asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs;
+ asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs;
+ dump->regs.orig_eax = regs->orig_rax;
+ dump->regs.eip = regs->rip;
+ dump->regs.cs = regs->cs;
+ dump->regs.eflags = regs->eflags;
+ dump->regs.esp = regs->rsp;
+ dump->regs.ss = regs->ss;
+
+#if 1 /* FIXME */
+ dump->u_fpvalid = 0;
+#else
+ dump->u_fpvalid = dump_fpu (regs, &dump->i387);
+#endif
+}
+
+#endif
+
+static struct linux_binfmt aout_format = {
+ .module = THIS_MODULE,
+ .load_binary = load_aout_binary,
+ .load_shlib = load_aout_library,
+#if CORE_DUMP
+ .core_dump = aout_core_dump,
+#endif
+ .min_coredump = PAGE_SIZE
+};
+
+static void set_brk(unsigned long start, unsigned long end)
+{
+ start = PAGE_ALIGN(start);
+ end = PAGE_ALIGN(end);
+ if (end <= start)
+ return;
+ down_write(&current->mm->mmap_sem);
+ do_brk(start, end - start);
+ up_write(&current->mm->mmap_sem);
+}
+
+#if CORE_DUMP
+/*
+ * These are the only things you should do on a core-file: use only these
+ * macros to write out all the necessary info.
+ */
+
+static int dump_write(struct file *file, const void *addr, int nr)
+{
+ return file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+}
+
+#define DUMP_WRITE(addr, nr) \
+ if (!dump_write(file, (void *)(addr), (nr))) \
+ goto end_coredump;
+
+#define DUMP_SEEK(offset) \
+if (file->f_op->llseek) { \
+ if (file->f_op->llseek(file,(offset),0) != (offset)) \
+ goto end_coredump; \
+} else file->f_pos = (offset)
+
+/*
+ * Routine writes a core dump image in the current directory.
+ * Currently only a stub-function.
+ *
+ * Note that setuid/setgid files won't make a core-dump if the uid/gid
+ * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
+ * field, which also makes sure the core-dumps won't be recursive if the
+ * dumping of the process results in another error..
+ */
+
+static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file)
+{
+ mm_segment_t fs;
+ int has_dumped = 0;
+ unsigned long dump_start, dump_size;
+ struct user32 dump;
+# define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
+# define START_STACK(u) (u.start_stack)
+
+ fs = get_fs();
+ set_fs(KERNEL_DS);
+ has_dumped = 1;
+ current->flags |= PF_DUMPCORE;
+ strncpy(dump.u_comm, current->comm, sizeof(current->comm));
+ dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump)));
+ dump.signal = signr;
+ dump_thread32(regs, &dump);
+
+/* If the size of the dump file exceeds the rlimit, then see what would happen
+ if we wrote the stack, but not the data area. */
+ if ((dump.u_dsize+dump.u_ssize+1) * PAGE_SIZE >
+ current->signal->rlim[RLIMIT_CORE].rlim_cur)
+ dump.u_dsize = 0;
+
+/* Make sure we have enough room to write the stack and data areas. */
+ if ((dump.u_ssize+1) * PAGE_SIZE >
+ current->signal->rlim[RLIMIT_CORE].rlim_cur)
+ dump.u_ssize = 0;
+
+/* make sure we actually have a data and stack area to dump */
+ set_fs(USER_DS);
+ if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
+ dump.u_dsize = 0;
+ if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
+ dump.u_ssize = 0;
+
+ set_fs(KERNEL_DS);
+/* struct user */
+ DUMP_WRITE(&dump,sizeof(dump));
+/* Now dump all of the user data. Include malloced stuff as well */
+ DUMP_SEEK(PAGE_SIZE);
+/* now we start writing out the user space info */
+ set_fs(USER_DS);
+/* Dump the data area */
+ if (dump.u_dsize != 0) {
+ dump_start = START_DATA(dump);
+ dump_size = dump.u_dsize << PAGE_SHIFT;
+ DUMP_WRITE(dump_start,dump_size);
+ }
+/* Now prepare to dump the stack area */
+ if (dump.u_ssize != 0) {
+ dump_start = START_STACK(dump);
+ dump_size = dump.u_ssize << PAGE_SHIFT;
+ DUMP_WRITE(dump_start,dump_size);
+ }
+/* Finally dump the task struct. Not be used by gdb, but could be useful */
+ set_fs(KERNEL_DS);
+ DUMP_WRITE(current,sizeof(*current));
+end_coredump:
+ set_fs(fs);
+ return has_dumped;
+}
+#endif
+
+/*
+ * create_aout_tables() parses the env- and arg-strings in new user
+ * memory and creates the pointer tables from them, and puts their
+ * addresses on the "stack", returning the new stack pointer value.
+ */
+static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm)
+{
+ u32 __user *argv;
+ u32 __user *envp;
+ u32 __user *sp;
+ int argc = bprm->argc;
+ int envc = bprm->envc;
+
+ sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p);
+ sp -= envc+1;
+ envp = sp;
+ sp -= argc+1;
+ argv = sp;
+ put_user((unsigned long) envp,--sp);
+ put_user((unsigned long) argv,--sp);
+ put_user(argc,--sp);
+ current->mm->arg_start = (unsigned long) p;
+ while (argc-->0) {
+ char c;
+ put_user((u32)(unsigned long)p,argv++);
+ do {
+ get_user(c,p++);
+ } while (c);
+ }
+ put_user(NULL,argv);
+ current->mm->arg_end = current->mm->env_start = (unsigned long) p;
+ while (envc-->0) {
+ char c;
+ put_user((u32)(unsigned long)p,envp++);
+ do {
+ get_user(c,p++);
+ } while (c);
+ }
+ put_user(NULL,envp);
+ current->mm->env_end = (unsigned long) p;
+ return sp;
+}
+
+/*
+ * These are the functions used to load a.out style executables and shared
+ * libraries. There is no binary dependent code anywhere else.
+ */
+
+static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+{
+ struct exec ex;
+ unsigned long error;
+ unsigned long fd_offset;
+ unsigned long rlim;
+ int retval;
+
+ ex = *((struct exec *) bprm->buf); /* exec-header */
+ if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
+ N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
+ N_TRSIZE(ex) || N_DRSIZE(ex) ||
+ i_size_read(bprm->file->f_dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+ return -ENOEXEC;
+ }
+
+ fd_offset = N_TXTOFF(ex);
+
+ /* Check initial limits. This avoids letting people circumvent
+ * size limits imposed on them by creating programs with large
+ * arrays in the data or bss.
+ */
+ rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur;
+ if (rlim >= RLIM_INFINITY)
+ rlim = ~0;
+ if (ex.a_data + ex.a_bss > rlim)
+ return -ENOMEM;
+
+ /* Flush all traces of the currently running executable */
+ retval = flush_old_exec(bprm);
+ if (retval)
+ return retval;
+
+ regs->cs = __USER32_CS;
+ regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
+ regs->r13 = regs->r14 = regs->r15 = 0;
+
+ /* OK, This is the point of no return */
+ set_personality(PER_LINUX);
+ set_thread_flag(TIF_IA32);
+ clear_thread_flag(TIF_ABI_PENDING);
+
+ current->mm->end_code = ex.a_text +
+ (current->mm->start_code = N_TXTADDR(ex));
+ current->mm->end_data = ex.a_data +
+ (current->mm->start_data = N_DATADDR(ex));
+ current->mm->brk = ex.a_bss +
+ (current->mm->start_brk = N_BSSADDR(ex));
+ current->mm->free_area_cache = TASK_UNMAPPED_BASE;
+
+ set_mm_counter(current->mm, rss, 0);
+ current->mm->mmap = NULL;
+ compute_creds(bprm);
+ current->flags &= ~PF_FORKNOEXEC;
+
+ if (N_MAGIC(ex) == OMAGIC) {
+ unsigned long text_addr, map_size;
+ loff_t pos;
+
+ text_addr = N_TXTADDR(ex);
+
+ pos = 32;
+ map_size = ex.a_text+ex.a_data;
+
+ down_write(&current->mm->mmap_sem);
+ error = do_brk(text_addr & PAGE_MASK, map_size);
+ up_write(&current->mm->mmap_sem);
+
+ if (error != (text_addr & PAGE_MASK)) {
+ send_sig(SIGKILL, current, 0);
+ return error;
+ }
+
+ error = bprm->file->f_op->read(bprm->file, (char *)text_addr,
+ ex.a_text+ex.a_data, &pos);
+ if ((signed long)error < 0) {
+ send_sig(SIGKILL, current, 0);
+ return error;
+ }
+
+ flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data);
+ } else {
+#ifdef WARN_OLD
+ static unsigned long error_time, error_time2;
+ if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
+ (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ)
+ {
+ printk(KERN_NOTICE "executable not page aligned\n");
+ error_time2 = jiffies;
+ }
+
+ if ((fd_offset & ~PAGE_MASK) != 0 &&
+ (jiffies-error_time) > 5*HZ)
+ {
+ printk(KERN_WARNING
+ "fd_offset is not page aligned. Please convert program: %s\n",
+ bprm->file->f_dentry->d_name.name);
+ error_time = jiffies;
+ }
+#endif
+
+ if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
+ loff_t pos = fd_offset;
+ down_write(&current->mm->mmap_sem);
+ do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
+ up_write(&current->mm->mmap_sem);
+ bprm->file->f_op->read(bprm->file,(char *)N_TXTADDR(ex),
+ ex.a_text+ex.a_data, &pos);
+ flush_icache_range((unsigned long) N_TXTADDR(ex),
+ (unsigned long) N_TXTADDR(ex) +
+ ex.a_text+ex.a_data);
+ goto beyond_if;
+ }
+
+ down_write(&current->mm->mmap_sem);
+ error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
+ PROT_READ | PROT_EXEC,
+ MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT,
+ fd_offset);
+ up_write(&current->mm->mmap_sem);
+
+ if (error != N_TXTADDR(ex)) {
+ send_sig(SIGKILL, current, 0);
+ return error;
+ }
+
+ down_write(&current->mm->mmap_sem);
+ error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT,
+ fd_offset + ex.a_text);
+ up_write(&current->mm->mmap_sem);
+ if (error != N_DATADDR(ex)) {
+ send_sig(SIGKILL, current, 0);
+ return error;
+ }
+ }
+beyond_if:
+ set_binfmt(&aout_format);
+
+ set_brk(current->mm->start_brk, current->mm->brk);
+
+ retval = ia32_setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
+ if (retval < 0) {
+ /* Someone check-me: is this error path enough? */
+ send_sig(SIGKILL, current, 0);
+ return retval;
+ }
+
+ current->mm->start_stack =
+ (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
+ /* start thread */
+ asm volatile("movl %0,%%fs" :: "r" (0)); \
+ asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS));
+ load_gs_index(0);
+ (regs)->rip = ex.a_entry;
+ (regs)->rsp = current->mm->start_stack;
+ (regs)->eflags = 0x200;
+ (regs)->cs = __USER32_CS;
+ (regs)->ss = __USER32_DS;
+ set_fs(USER_DS);
+ if (unlikely(current->ptrace & PT_PTRACED)) {
+ if (current->ptrace & PT_TRACE_EXEC)
+ ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
+ else
+ send_sig(SIGTRAP, current, 0);
+ }
+ return 0;
+}
+
+static int load_aout_library(struct file *file)
+{
+ struct inode * inode;
+ unsigned long bss, start_addr, len;
+ unsigned long error;
+ int retval;
+ struct exec ex;
+
+ inode = file->f_dentry->d_inode;
+
+ retval = -ENOEXEC;
+ error = kernel_read(file, 0, (char *) &ex, sizeof(ex));
+ if (error != sizeof(ex))
+ goto out;
+
+ /* We come in here for the regular a.out style of shared libraries */
+ if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
+ N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
+ i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
+ goto out;
+ }
+
+ if (N_FLAGS(ex))
+ goto out;
+
+ /* For QMAGIC, the starting address is 0x20 into the page. We mask
+ this off to get the starting address for the page */
+
+ start_addr = ex.a_entry & 0xfffff000;
+
+ if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
+ loff_t pos = N_TXTOFF(ex);
+
+#ifdef WARN_OLD
+ static unsigned long error_time;
+ if ((jiffies-error_time) > 5*HZ)
+ {
+ printk(KERN_WARNING
+ "N_TXTOFF is not page aligned. Please convert library: %s\n",
+ file->f_dentry->d_name.name);
+ error_time = jiffies;
+ }
+#endif
+ down_write(&current->mm->mmap_sem);
+ do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
+ up_write(&current->mm->mmap_sem);
+
+ file->f_op->read(file, (char *)start_addr,
+ ex.a_text + ex.a_data, &pos);
+ flush_icache_range((unsigned long) start_addr,
+ (unsigned long) start_addr + ex.a_text + ex.a_data);
+
+ retval = 0;
+ goto out;
+ }
+ /* Now use mmap to map the library into memory. */
+ down_write(&current->mm->mmap_sem);
+ error = do_mmap(file, start_addr, ex.a_text + ex.a_data,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT,
+ N_TXTOFF(ex));
+ up_write(&current->mm->mmap_sem);
+ retval = error;
+ if (error != start_addr)
+ goto out;
+
+ len = PAGE_ALIGN(ex.a_text + ex.a_data);
+ bss = ex.a_text + ex.a_data + ex.a_bss;
+ if (bss > len) {
+ down_write(&current->mm->mmap_sem);
+ error = do_brk(start_addr + len, bss - len);
+ up_write(&current->mm->mmap_sem);
+ retval = error;
+ if (error != start_addr + len)
+ goto out;
+ }
+ retval = 0;
+out:
+ return retval;
+}
+
+static int __init init_aout_binfmt(void)
+{
+ return register_binfmt(&aout_format);
+}
+
+static void __exit exit_aout_binfmt(void)
+{
+ unregister_binfmt(&aout_format);
+}
+
+module_init(init_aout_binfmt);
+module_exit(exit_aout_binfmt);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
new file mode 100644
index 000000000000..93d568dfa762
--- /dev/null
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -0,0 +1,434 @@
+/*
+ * Written 2000,2002 by Andi Kleen.
+ *
+ * Loosely based on the sparc64 and IA64 32bit emulation loaders.
+ * This tricks binfmt_elf.c into loading 32bit binaries using lots
+ * of ugly preprocessor tricks. Talk about very very poor man's inheritance.
+ */
+#include <linux/types.h>
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/compat.h>
+#include <linux/string.h>
+#include <linux/binfmts.h>
+#include <linux/mm.h>
+#include <linux/security.h>
+
+#include <asm/segment.h>
+#include <asm/ptrace.h>
+#include <asm/processor.h>
+#include <asm/user32.h>
+#include <asm/sigcontext32.h>
+#include <asm/fpu32.h>
+#include <asm/i387.h>
+#include <asm/uaccess.h>
+#include <asm/ia32.h>
+#include <asm/vsyscall32.h>
+
+#define ELF_NAME "elf/i386"
+
+#define AT_SYSINFO 32
+#define AT_SYSINFO_EHDR 33
+
+int sysctl_vsyscall32 = 1;
+
+#define ARCH_DLINFO do { \
+ if (sysctl_vsyscall32) { \
+ NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \
+ NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE); \
+ } \
+} while(0)
+
+struct file;
+struct elf_phdr;
+
+#define IA32_EMULATOR 1
+
+#define ELF_ET_DYN_BASE (TASK_UNMAPPED_32 + 0x1000000)
+
+#undef ELF_ARCH
+#define ELF_ARCH EM_386
+
+#undef ELF_CLASS
+#define ELF_CLASS ELFCLASS32
+
+#define ELF_DATA ELFDATA2LSB
+
+#define USE_ELF_CORE_DUMP 1
+
+/* Overwrite elfcore.h */
+#define _LINUX_ELFCORE_H 1
+typedef unsigned int elf_greg_t;
+
+#define ELF_NGREG (sizeof (struct user_regs_struct32) / sizeof(elf_greg_t))
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+
+/*
+ * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out
+ * extra segments containing the vsyscall DSO contents. Dumping its
+ * contents makes post-mortem fully interpretable later without matching up
+ * the same kernel and hardware config to see what PC values meant.
+ * Dumping its extra ELF program headers includes all the other information
+ * a debugger needs to easily find how the vsyscall DSO was being used.
+ */
+#define ELF_CORE_EXTRA_PHDRS (VSYSCALL32_EHDR->e_phnum)
+#define ELF_CORE_WRITE_EXTRA_PHDRS \
+do { \
+ const struct elf32_phdr *const vsyscall_phdrs = \
+ (const struct elf32_phdr *) (VSYSCALL32_BASE \
+ + VSYSCALL32_EHDR->e_phoff); \
+ int i; \
+ Elf32_Off ofs = 0; \
+ for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \
+ struct elf32_phdr phdr = vsyscall_phdrs[i]; \
+ if (phdr.p_type == PT_LOAD) { \
+ BUG_ON(ofs != 0); \
+ ofs = phdr.p_offset = offset; \
+ phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); \
+ phdr.p_filesz = phdr.p_memsz; \
+ offset += phdr.p_filesz; \
+ } \
+ else \
+ phdr.p_offset += ofs; \
+ phdr.p_paddr = 0; /* match other core phdrs */ \
+ DUMP_WRITE(&phdr, sizeof(phdr)); \
+ } \
+} while (0)
+#define ELF_CORE_WRITE_EXTRA_DATA \
+do { \
+ const struct elf32_phdr *const vsyscall_phdrs = \
+ (const struct elf32_phdr *) (VSYSCALL32_BASE \
+ + VSYSCALL32_EHDR->e_phoff); \
+ int i; \
+ for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \
+ if (vsyscall_phdrs[i].p_type == PT_LOAD) \
+ DUMP_WRITE((void *) (u64) vsyscall_phdrs[i].p_vaddr, \
+ PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \
+ } \
+} while (0)
+
+struct elf_siginfo
+{
+ int si_signo; /* signal number */
+ int si_code; /* extra code */
+ int si_errno; /* errno */
+};
+
+#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0)
+
+struct elf_prstatus
+{
+ struct elf_siginfo pr_info; /* Info associated with signal */
+ short pr_cursig; /* Current signal */
+ unsigned int pr_sigpend; /* Set of pending signals */
+ unsigned int pr_sighold; /* Set of held signals */
+ pid_t pr_pid;
+ pid_t pr_ppid;
+ pid_t pr_pgrp;
+ pid_t pr_sid;
+ struct compat_timeval pr_utime; /* User time */
+ struct compat_timeval pr_stime; /* System time */
+ struct compat_timeval pr_cutime; /* Cumulative user time */
+ struct compat_timeval pr_cstime; /* Cumulative system time */
+ elf_gregset_t pr_reg; /* GP registers */
+ int pr_fpvalid; /* True if math co-processor being used. */
+};
+
+#define ELF_PRARGSZ (80) /* Number of chars for args */
+
+struct elf_prpsinfo
+{
+ char pr_state; /* numeric process state */
+ char pr_sname; /* char for pr_state */
+ char pr_zomb; /* zombie */
+ char pr_nice; /* nice val */
+ unsigned int pr_flag; /* flags */
+ __u16 pr_uid;
+ __u16 pr_gid;
+ pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid;
+ /* Lots missing */
+ char pr_fname[16]; /* filename of executable */
+ char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */
+};
+
+#define __STR(x) #x
+#define STR(x) __STR(x)
+
+#define _GET_SEG(x) \
+ ({ __u32 seg; asm("movl %%" STR(x) ",%0" : "=r"(seg)); seg; })
+
+/* Assumes current==process to be dumped */
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+ pr_reg[0] = regs->rbx; \
+ pr_reg[1] = regs->rcx; \
+ pr_reg[2] = regs->rdx; \
+ pr_reg[3] = regs->rsi; \
+ pr_reg[4] = regs->rdi; \
+ pr_reg[5] = regs->rbp; \
+ pr_reg[6] = regs->rax; \
+ pr_reg[7] = _GET_SEG(ds); \
+ pr_reg[8] = _GET_SEG(es); \
+ pr_reg[9] = _GET_SEG(fs); \
+ pr_reg[10] = _GET_SEG(gs); \
+ pr_reg[11] = regs->orig_rax; \
+ pr_reg[12] = regs->rip; \
+ pr_reg[13] = regs->cs; \
+ pr_reg[14] = regs->eflags; \
+ pr_reg[15] = regs->rsp; \
+ pr_reg[16] = regs->ss;
+
+#define user user32
+
+#define __ASM_X86_64_ELF_H 1
+#define elf_read_implies_exec(ex, have_pt_gnu_stack) (!(have_pt_gnu_stack))
+//#include <asm/ia32.h>
+#include <linux/elf.h>
+
+typedef struct user_i387_ia32_struct elf_fpregset_t;
+typedef struct user32_fxsr_struct elf_fpxregset_t;
+
+
+static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *regs)
+{
+ ELF_CORE_COPY_REGS((*elfregs), regs)
+}
+
+static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs)
+{
+ struct pt_regs *pp = (struct pt_regs *)(t->thread.rsp0);
+ --pp;
+ ELF_CORE_COPY_REGS((*elfregs), pp);
+ /* fix wrong segments */
+ (*elfregs)[7] = t->thread.ds;
+ (*elfregs)[9] = t->thread.fsindex;
+ (*elfregs)[10] = t->thread.gsindex;
+ (*elfregs)[8] = t->thread.es;
+ return 1;
+}
+
+static inline int
+elf_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, elf_fpregset_t *fpu)
+{
+ struct _fpstate_ia32 *fpstate = (void*)fpu;
+ mm_segment_t oldfs = get_fs();
+
+ if (!tsk_used_math(tsk))
+ return 0;
+ if (!regs)
+ regs = (struct pt_regs *)tsk->thread.rsp0;
+ --regs;
+ if (tsk == current)
+ unlazy_fpu(tsk);
+ set_fs(KERNEL_DS);
+ save_i387_ia32(tsk, fpstate, regs, 1);
+ /* Correct for i386 bug. It puts the fop into the upper 16bits of
+ the tag word (like FXSAVE), not into the fcs*/
+ fpstate->cssel |= fpstate->tag & 0xffff0000;
+ set_fs(oldfs);
+ return 1;
+}
+
+#define ELF_CORE_COPY_XFPREGS 1
+static inline int
+elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
+{
+ struct pt_regs *regs = ((struct pt_regs *)(t->thread.rsp0))-1;
+ if (!tsk_used_math(t))
+ return 0;
+ if (t == current)
+ unlazy_fpu(t);
+ memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t));
+ xfpu->fcs = regs->cs;
+ xfpu->fos = t->thread.ds; /* right? */
+ return 1;
+}
+
+#undef elf_check_arch
+#define elf_check_arch(x) \
+ ((x)->e_machine == EM_386)
+
+extern int force_personality32;
+
+#define ELF_EXEC_PAGESIZE PAGE_SIZE
+#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
+#define ELF_PLATFORM ("i686")
+#define SET_PERSONALITY(ex, ibcs2) \
+do { \
+ unsigned long new_flags = 0; \
+ if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \
+ new_flags = _TIF_IA32; \
+ if ((current_thread_info()->flags & _TIF_IA32) \
+ != new_flags) \
+ set_thread_flag(TIF_ABI_PENDING); \
+ else \
+ clear_thread_flag(TIF_ABI_PENDING); \
+ /* XXX This overwrites the user set personality */ \
+ current->personality |= force_personality32; \
+} while (0)
+
+/* Override some function names */
+#define elf_format elf32_format
+
+#define init_elf_binfmt init_elf32_binfmt
+#define exit_elf_binfmt exit_elf32_binfmt
+
+#define load_elf_binary load_elf32_binary
+
+#define ELF_PLAT_INIT(r, load_addr) elf32_init(r)
+#define setup_arg_pages(bprm, stack_top, exec_stack) \
+ ia32_setup_arg_pages(bprm, stack_top, exec_stack)
+int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack);
+
+#undef start_thread
+#define start_thread(regs,new_rip,new_rsp) do { \
+ asm volatile("movl %0,%%fs" :: "r" (0)); \
+ asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \
+ load_gs_index(0); \
+ (regs)->rip = (new_rip); \
+ (regs)->rsp = (new_rsp); \
+ (regs)->eflags = 0x200; \
+ (regs)->cs = __USER32_CS; \
+ (regs)->ss = __USER32_DS; \
+ set_fs(USER_DS); \
+} while(0)
+
+
+#define elf_map elf32_map
+
+#include <linux/module.h>
+
+MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries.");
+MODULE_AUTHOR("Eric Youngdale, Andi Kleen");
+
+#undef MODULE_DESCRIPTION
+#undef MODULE_AUTHOR
+
+#define elf_addr_t __u32
+
+#undef TASK_SIZE
+#define TASK_SIZE 0xffffffff
+
+static void elf32_init(struct pt_regs *);
+
+#include "../../../fs/binfmt_elf.c"
+
+static void elf32_init(struct pt_regs *regs)
+{
+ struct task_struct *me = current;
+ regs->rdi = 0;
+ regs->rsi = 0;
+ regs->rdx = 0;
+ regs->rcx = 0;
+ regs->rax = 0;
+ regs->rbx = 0;
+ regs->rbp = 0;
+ regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
+ regs->r13 = regs->r14 = regs->r15 = 0;
+ me->thread.fs = 0;
+ me->thread.gs = 0;
+ me->thread.fsindex = 0;
+ me->thread.gsindex = 0;
+ me->thread.ds = __USER_DS;
+ me->thread.es = __USER_DS;
+}
+
+int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack)
+{
+ unsigned long stack_base;
+ struct vm_area_struct *mpnt;
+ struct mm_struct *mm = current->mm;
+ int i, ret;
+
+ stack_base = IA32_STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE;
+ mm->arg_start = bprm->p + stack_base;
+
+ bprm->p += stack_base;
+ if (bprm->loader)
+ bprm->loader += stack_base;
+ bprm->exec += stack_base;
+
+ mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+ if (!mpnt)
+ return -ENOMEM;
+
+ if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) {
+ kmem_cache_free(vm_area_cachep, mpnt);
+ return -ENOMEM;
+ }
+
+ memset(mpnt, 0, sizeof(*mpnt));
+
+ down_write(&mm->mmap_sem);
+ {
+ mpnt->vm_mm = mm;
+ mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
+ mpnt->vm_end = IA32_STACK_TOP;
+ if (executable_stack == EXSTACK_ENABLE_X)
+ mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC;
+ else if (executable_stack == EXSTACK_DISABLE_X)
+ mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
+ else
+ mpnt->vm_flags = VM_STACK_FLAGS;
+ mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ?
+ PAGE_COPY_EXEC : PAGE_COPY;
+ if ((ret = insert_vm_struct(mm, mpnt))) {
+ up_write(&mm->mmap_sem);
+ kmem_cache_free(vm_area_cachep, mpnt);
+ return ret;
+ }
+ mm->stack_vm = mm->total_vm = vma_pages(mpnt);
+ }
+
+ for (i = 0 ; i < MAX_ARG_PAGES ; i++) {
+ struct page *page = bprm->page[i];
+ if (page) {
+ bprm->page[i] = NULL;
+ install_arg_page(mpnt, page, stack_base);
+ }
+ stack_base += PAGE_SIZE;
+ }
+ up_write(&mm->mmap_sem);
+
+ return 0;
+}
+
+static unsigned long
+elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type)
+{
+ unsigned long map_addr;
+ struct task_struct *me = current;
+
+ down_write(&me->mm->mmap_sem);
+ map_addr = do_mmap(filep, ELF_PAGESTART(addr),
+ eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr), prot,
+ type,
+ eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr));
+ up_write(&me->mm->mmap_sem);
+ return(map_addr);
+}
+
+#ifdef CONFIG_SYSCTL
+/* Register vsyscall32 into the ABI table */
+#include <linux/sysctl.h>
+
+static ctl_table abi_table2[] = {
+ { 99, "vsyscall32", &sysctl_vsyscall32, sizeof(int), 0644, NULL,
+ proc_dointvec },
+ { 0, }
+};
+
+static ctl_table abi_root_table2[] = {
+ { .ctl_name = CTL_ABI, .procname = "abi", .mode = 0555,
+ .child = abi_table2 },
+ { 0 },
+};
+
+static __init int ia32_binfmt_init(void)
+{
+ register_sysctl_table(abi_root_table2, 1);
+ return 0;
+}
+__initcall(ia32_binfmt_init);
+#endif
diff --git a/arch/x86_64/ia32/ia32_ioctl.c b/arch/x86_64/ia32/ia32_ioctl.c
new file mode 100644
index 000000000000..d259f8a6f811
--- /dev/null
+++ b/arch/x86_64/ia32/ia32_ioctl.c
@@ -0,0 +1,201 @@
+/* $Id: ia32_ioctl.c,v 1.25 2002/10/11 07:17:06 ak Exp $
+ * ioctl32.c: Conversion between 32bit and 64bit native ioctls.
+ *
+ * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
+ * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
+ * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs
+ *
+ * These routines maintain argument size conversion between 32bit and 64bit
+ * ioctls.
+ */
+
+#define INCLUDES
+#include <linux/syscalls.h>
+#include "compat_ioctl.c"
+#include <asm/mtrr.h>
+#include <asm/ia32.h>
+
+#define CODE
+#include "compat_ioctl.c"
+
+#ifndef TIOCGDEV
+#define TIOCGDEV _IOR('T',0x32, unsigned int)
+#endif
+static int tiocgdev(unsigned fd, unsigned cmd, unsigned int __user *ptr)
+{
+
+ struct file *file = fget(fd);
+ struct tty_struct *real_tty;
+
+ if (!file)
+ return -EBADF;
+ if (file->f_op->ioctl != tty_ioctl)
+ return -EINVAL;
+ real_tty = (struct tty_struct *)file->private_data;
+ if (!real_tty)
+ return -EINVAL;
+ return put_user(new_encode_dev(tty_devnum(real_tty)), ptr);
+}
+
+#define RTC_IRQP_READ32 _IOR('p', 0x0b, unsigned int) /* Read IRQ rate */
+#define RTC_IRQP_SET32 _IOW('p', 0x0c, unsigned int) /* Set IRQ rate */
+#define RTC_EPOCH_READ32 _IOR('p', 0x0d, unsigned) /* Read epoch */
+#define RTC_EPOCH_SET32 _IOW('p', 0x0e, unsigned) /* Set epoch */
+
+static int rtc32_ioctl(unsigned fd, unsigned cmd, unsigned long arg)
+{
+ unsigned long val;
+ mm_segment_t oldfs = get_fs();
+ int ret;
+
+ switch (cmd) {
+ case RTC_IRQP_READ32:
+ set_fs(KERNEL_DS);
+ ret = sys_ioctl(fd, RTC_IRQP_READ, (unsigned long)&val);
+ set_fs(oldfs);
+ if (!ret)
+ ret = put_user(val, (unsigned int __user *) arg);
+ return ret;
+
+ case RTC_IRQP_SET32:
+ cmd = RTC_IRQP_SET;
+ break;
+
+ case RTC_EPOCH_READ32:
+ set_fs(KERNEL_DS);
+ ret = sys_ioctl(fd, RTC_EPOCH_READ, (unsigned long) &val);
+ set_fs(oldfs);
+ if (!ret)
+ ret = put_user(val, (unsigned int __user *) arg);
+ return ret;
+
+ case RTC_EPOCH_SET32:
+ cmd = RTC_EPOCH_SET;
+ break;
+ }
+ return sys_ioctl(fd,cmd,arg);
+}
+
+/* /proc/mtrr ioctls */
+
+
+struct mtrr_sentry32
+{
+ compat_ulong_t base; /* Base address */
+ compat_uint_t size; /* Size of region */
+ compat_uint_t type; /* Type of region */
+};
+
+struct mtrr_gentry32
+{
+ compat_ulong_t regnum; /* Register number */
+ compat_uint_t base; /* Base address */
+ compat_uint_t size; /* Size of region */
+ compat_uint_t type; /* Type of region */
+};
+
+#define MTRR_IOCTL_BASE 'M'
+
+#define MTRRIOC32_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32)
+#define MTRRIOC32_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry32)
+#define MTRRIOC32_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry32)
+#define MTRRIOC32_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32)
+#define MTRRIOC32_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry32)
+#define MTRRIOC32_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry32)
+#define MTRRIOC32_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry32)
+#define MTRRIOC32_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry32)
+#define MTRRIOC32_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32)
+#define MTRRIOC32_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32)
+
+
+static int mtrr_ioctl32(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+ struct mtrr_gentry g;
+ struct mtrr_sentry s;
+ int get = 0, err = 0;
+ struct mtrr_gentry32 __user *g32 = (struct mtrr_gentry32 __user *)arg;
+ mm_segment_t oldfs = get_fs();
+
+ switch (cmd) {
+#define SET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; break
+#define GET(x) case MTRRIOC32_ ## x ## _ENTRY: cmd = MTRRIOC_ ## x ## _ENTRY; get=1; break
+ SET(ADD);
+ SET(SET);
+ SET(DEL);
+ GET(GET);
+ SET(KILL);
+ SET(ADD_PAGE);
+ SET(SET_PAGE);
+ SET(DEL_PAGE);
+ GET(GET_PAGE);
+ SET(KILL_PAGE);
+ }
+
+ if (get) {
+ err = get_user(g.regnum, &g32->regnum);
+ err |= get_user(g.base, &g32->base);
+ err |= get_user(g.size, &g32->size);
+ err |= get_user(g.type, &g32->type);
+
+ arg = (unsigned long)&g;
+ } else {
+ struct mtrr_sentry32 __user *s32 = (struct mtrr_sentry32 __user *)arg;
+ err = get_user(s.base, &s32->base);
+ err |= get_user(s.size, &s32->size);
+ err |= get_user(s.type, &s32->type);
+
+ arg = (unsigned long)&s;
+ }
+ if (err) return err;
+
+ set_fs(KERNEL_DS);
+ err = sys_ioctl(fd, cmd, arg);
+ set_fs(oldfs);
+
+ if (!err && get) {
+ err = put_user(g.base, &g32->base);
+ err |= put_user(g.size, &g32->size);
+ err |= put_user(g.regnum, &g32->regnum);
+ err |= put_user(g.type, &g32->type);
+ }
+ return err;
+}
+
+#define HANDLE_IOCTL(cmd,handler) { (cmd), (ioctl_trans_handler_t)(handler) },
+#define COMPATIBLE_IOCTL(cmd) HANDLE_IOCTL(cmd,sys_ioctl)
+
+struct ioctl_trans ioctl_start[] = {
+#include <linux/compat_ioctl.h>
+#define DECLARES
+#include "compat_ioctl.c"
+COMPATIBLE_IOCTL(HDIO_SET_KEEPSETTINGS)
+COMPATIBLE_IOCTL(HDIO_SCAN_HWIF)
+COMPATIBLE_IOCTL(BLKRASET)
+COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */
+COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */
+COMPATIBLE_IOCTL(FIOQSIZE)
+
+/* And these ioctls need translation */
+HANDLE_IOCTL(TIOCGDEV, tiocgdev)
+/* realtime device */
+HANDLE_IOCTL(RTC_IRQP_READ, rtc32_ioctl)
+HANDLE_IOCTL(RTC_IRQP_READ32,rtc32_ioctl)
+HANDLE_IOCTL(RTC_IRQP_SET32, rtc32_ioctl)
+HANDLE_IOCTL(RTC_EPOCH_READ32, rtc32_ioctl)
+HANDLE_IOCTL(RTC_EPOCH_SET32, rtc32_ioctl)
+/* take care of sizeof(sizeof()) breakage */
+/* mtrr */
+HANDLE_IOCTL(MTRRIOC32_ADD_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_SET_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_DEL_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_GET_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_KILL_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_ADD_PAGE_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_SET_PAGE_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_DEL_PAGE_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_GET_PAGE_ENTRY, mtrr_ioctl32)
+HANDLE_IOCTL(MTRRIOC32_KILL_PAGE_ENTRY, mtrr_ioctl32)
+};
+
+int ioctl_table_size = ARRAY_SIZE(ioctl_start);
+
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
new file mode 100644
index 000000000000..fbd09b5126ce
--- /dev/null
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -0,0 +1,621 @@
+/*
+ * linux/arch/x86_64/ia32/ia32_signal.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
+ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
+ * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen
+ *
+ * $Id: ia32_signal.c,v 1.22 2002/07/29 10:34:03 ak Exp $
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/ptrace.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/compat.h>
+#include <asm/ucontext.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include <asm/ia32.h>
+#include <asm/ptrace.h>
+#include <asm/ia32_unistd.h>
+#include <asm/user32.h>
+#include <asm/sigcontext32.h>
+#include <asm/fpu32.h>
+#include <asm/proto.h>
+#include <asm/vsyscall32.h>
+
+#define DEBUG_SIG 0
+
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
+asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
+
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+{
+ int err;
+ if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
+ return -EFAULT;
+
+ /* If you change siginfo_t structure, please make sure that
+ this code is fixed accordingly.
+ It should never copy any pad contained in the structure
+ to avoid security leaks, but must copy the generic
+ 3 ints plus the relevant union member. */
+ err = __put_user(from->si_signo, &to->si_signo);
+ err |= __put_user(from->si_errno, &to->si_errno);
+ err |= __put_user((short)from->si_code, &to->si_code);
+
+ if (from->si_code < 0) {
+ err |= __put_user(from->si_pid, &to->si_pid);
+ err |= __put_user(from->si_uid, &to->si_uid);
+ err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr);
+ } else {
+ /* First 32bits of unions are always present:
+ * si_pid === si_band === si_tid === si_addr(LS half) */
+ err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]);
+ switch (from->si_code >> 16) {
+ case __SI_FAULT >> 16:
+ break;
+ case __SI_CHLD >> 16:
+ err |= __put_user(from->si_utime, &to->si_utime);
+ err |= __put_user(from->si_stime, &to->si_stime);
+ err |= __put_user(from->si_status, &to->si_status);
+ /* FALL THROUGH */
+ default:
+ case __SI_KILL >> 16:
+ err |= __put_user(from->si_uid, &to->si_uid);
+ break;
+ case __SI_POLL >> 16:
+ err |= __put_user(from->si_fd, &to->si_fd);
+ break;
+ case __SI_TIMER >> 16:
+ err |= __put_user(from->si_overrun, &to->si_overrun);
+ err |= __put_user(ptr_to_compat(from->si_ptr),
+ &to->si_ptr);
+ break;
+ case __SI_RT >> 16: /* This is not generated by the kernel as of now. */
+ case __SI_MESGQ >> 16:
+ err |= __put_user(from->si_uid, &to->si_uid);
+ err |= __put_user(from->si_int, &to->si_int);
+ break;
+ }
+ }
+ return err;
+}
+
+int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
+{
+ int err;
+ u32 ptr32;
+ if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t)))
+ return -EFAULT;
+
+ err = __get_user(to->si_signo, &from->si_signo);
+ err |= __get_user(to->si_errno, &from->si_errno);
+ err |= __get_user(to->si_code, &from->si_code);
+
+ err |= __get_user(to->si_pid, &from->si_pid);
+ err |= __get_user(to->si_uid, &from->si_uid);
+ err |= __get_user(ptr32, &from->si_ptr);
+ to->si_ptr = compat_ptr(ptr32);
+
+ return err;
+}
+
+asmlinkage long
+sys32_sigsuspend(int history0, int history1, old_sigset_t mask,
+ struct pt_regs *regs)
+{
+ sigset_t saveset;
+
+ mask &= _BLOCKABLE;
+ spin_lock_irq(&current->sighand->siglock);
+ saveset = current->blocked;
+ siginitset(&current->blocked, mask);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ regs->rax = -EINTR;
+ while (1) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ if (do_signal(regs, &saveset))
+ return -EINTR;
+ }
+}
+
+asmlinkage long
+sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,
+ stack_ia32_t __user *uoss_ptr,
+ struct pt_regs *regs)
+{
+ stack_t uss,uoss;
+ int ret;
+ mm_segment_t seg;
+ if (uss_ptr) {
+ u32 ptr;
+ memset(&uss,0,sizeof(stack_t));
+ if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) ||
+ __get_user(ptr, &uss_ptr->ss_sp) ||
+ __get_user(uss.ss_flags, &uss_ptr->ss_flags) ||
+ __get_user(uss.ss_size, &uss_ptr->ss_size))
+ return -EFAULT;
+ uss.ss_sp = compat_ptr(ptr);
+ }
+ seg = get_fs();
+ set_fs(KERNEL_DS);
+ ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp);
+ set_fs(seg);
+ if (ret >= 0 && uoss_ptr) {
+ if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) ||
+ __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
+ __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
+ __put_user(uoss.ss_size, &uoss_ptr->ss_size))
+ ret = -EFAULT;
+ }
+ return ret;
+}
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+
+struct sigframe
+{
+ u32 pretcode;
+ int sig;
+ struct sigcontext_ia32 sc;
+ struct _fpstate_ia32 fpstate;
+ unsigned int extramask[_COMPAT_NSIG_WORDS-1];
+ char retcode[8];
+};
+
+struct rt_sigframe
+{
+ u32 pretcode;
+ int sig;
+ u32 pinfo;
+ u32 puc;
+ compat_siginfo_t info;
+ struct ucontext_ia32 uc;
+ struct _fpstate_ia32 fpstate;
+ char retcode[8];
+};
+
+static int
+ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax)
+{
+ unsigned int err = 0;
+
+ /* Always make any pending restarted system calls return -EINTR */
+ current_thread_info()->restart_block.fn = do_no_restart_syscall;
+
+#if DEBUG_SIG
+ printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n",
+ sc, sc->err, sc->eip, sc->cs, sc->eflags);
+#endif
+#define COPY(x) { \
+ unsigned int reg; \
+ err |= __get_user(reg, &sc->e ##x); \
+ regs->r ## x = reg; \
+}
+
+#define RELOAD_SEG(seg,mask) \
+ { unsigned int cur; \
+ unsigned short pre; \
+ err |= __get_user(pre, &sc->seg); \
+ asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \
+ pre |= mask; \
+ if (pre != cur) loadsegment(seg,pre); }
+
+ /* Reload fs and gs if they have changed in the signal handler.
+ This does not handle long fs/gs base changes in the handler, but
+ does not clobber them at least in the normal case. */
+
+ {
+ unsigned gs, oldgs;
+ err |= __get_user(gs, &sc->gs);
+ gs |= 3;
+ asm("movl %%gs,%0" : "=r" (oldgs));
+ if (gs != oldgs)
+ load_gs_index(gs);
+ }
+ RELOAD_SEG(fs,3);
+ RELOAD_SEG(ds,3);
+ RELOAD_SEG(es,3);
+
+ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
+ COPY(dx); COPY(cx); COPY(ip);
+ /* Don't touch extended registers */
+
+ err |= __get_user(regs->cs, &sc->cs);
+ regs->cs |= 3;
+ err |= __get_user(regs->ss, &sc->ss);
+ regs->ss |= 3;
+
+ {
+ unsigned int tmpflags;
+ err |= __get_user(tmpflags, &sc->eflags);
+ regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5);
+ regs->orig_rax = -1; /* disable syscall checks */
+ }
+
+ {
+ u32 tmp;
+ struct _fpstate_ia32 __user * buf;
+ err |= __get_user(tmp, &sc->fpstate);
+ buf = compat_ptr(tmp);
+ if (buf) {
+ if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
+ goto badframe;
+ err |= restore_i387_ia32(current, buf, 0);
+ } else {
+ struct task_struct *me = current;
+ if (used_math()) {
+ clear_fpu(me);
+ clear_used_math();
+ }
+ }
+ }
+
+ {
+ u32 tmp;
+ err |= __get_user(tmp, &sc->eax);
+ *peax = tmp;
+ }
+ return err;
+
+badframe:
+ return 1;
+}
+
+asmlinkage long sys32_sigreturn(struct pt_regs *regs)
+{
+ struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8);
+ sigset_t set;
+ unsigned int eax;
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__get_user(set.sig[0], &frame->sc.oldmask)
+ || (_COMPAT_NSIG_WORDS > 1
+ && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask,
+ sizeof(frame->extramask))))
+ goto badframe;
+
+ sigdelsetmask(&set, ~_BLOCKABLE);
+ spin_lock_irq(&current->sighand->siglock);
+ current->blocked = set;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (ia32_restore_sigcontext(regs, &frame->sc, &eax))
+ goto badframe;
+ return eax;
+
+badframe:
+ signal_fault(regs, frame, "32bit sigreturn");
+ return 0;
+}
+
+asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
+{
+ struct rt_sigframe __user *frame;
+ sigset_t set;
+ unsigned int eax;
+ struct pt_regs tregs;
+
+ frame = (struct rt_sigframe __user *)(regs->rsp - 4);
+
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+ goto badframe;
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+ goto badframe;
+
+ sigdelsetmask(&set, ~_BLOCKABLE);
+ spin_lock_irq(&current->sighand->siglock);
+ current->blocked = set;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
+ goto badframe;
+
+ tregs = *regs;
+ if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT)
+ goto badframe;
+
+ return eax;
+
+badframe:
+ signal_fault(regs,frame,"32bit rt sigreturn");
+ return 0;
+}
+
+/*
+ * Set up a signal frame.
+ */
+
+static int
+ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate,
+ struct pt_regs *regs, unsigned int mask)
+{
+ int tmp, err = 0;
+ u32 eflags;
+
+ tmp = 0;
+ __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
+ err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
+ __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
+ err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
+ __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp));
+ err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
+ __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp));
+ err |= __put_user(tmp, (unsigned int __user *)&sc->es);
+
+ err |= __put_user((u32)regs->rdi, &sc->edi);
+ err |= __put_user((u32)regs->rsi, &sc->esi);
+ err |= __put_user((u32)regs->rbp, &sc->ebp);
+ err |= __put_user((u32)regs->rsp, &sc->esp);
+ err |= __put_user((u32)regs->rbx, &sc->ebx);
+ err |= __put_user((u32)regs->rdx, &sc->edx);
+ err |= __put_user((u32)regs->rcx, &sc->ecx);
+ err |= __put_user((u32)regs->rax, &sc->eax);
+ err |= __put_user((u32)regs->cs, &sc->cs);
+ err |= __put_user((u32)regs->ss, &sc->ss);
+ err |= __put_user(current->thread.trap_no, &sc->trapno);
+ err |= __put_user(current->thread.error_code, &sc->err);
+ err |= __put_user((u32)regs->rip, &sc->eip);
+ eflags = regs->eflags;
+ if (current->ptrace & PT_PTRACED)
+ eflags &= ~TF_MASK;
+ err |= __put_user((u32)eflags, &sc->eflags);
+ err |= __put_user((u32)regs->rsp, &sc->esp_at_signal);
+
+ tmp = save_i387_ia32(current, fpstate, regs, 0);
+ if (tmp < 0)
+ err = -EFAULT;
+ else {
+ clear_used_math();
+ stts();
+ err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
+ &sc->fpstate);
+ }
+
+ /* non-iBCS2 extensions.. */
+ err |= __put_user(mask, &sc->oldmask);
+ err |= __put_user(current->thread.cr2, &sc->cr2);
+
+ return err;
+}
+
+/*
+ * Determine which stack to use..
+ */
+static void __user *
+get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
+{
+ unsigned long rsp;
+
+ /* Default to using normal stack */
+ rsp = regs->rsp;
+
+ /* This is the X/Open sanctioned signal stack switching. */
+ if (ka->sa.sa_flags & SA_ONSTACK) {
+ if (sas_ss_flags(rsp) == 0)
+ rsp = current->sas_ss_sp + current->sas_ss_size;
+ }
+
+ /* This is the legacy signal stack switching. */
+ else if ((regs->ss & 0xffff) != __USER_DS &&
+ !(ka->sa.sa_flags & SA_RESTORER) &&
+ ka->sa.sa_restorer) {
+ rsp = (unsigned long) ka->sa.sa_restorer;
+ }
+
+ return (void __user *)((rsp - frame_size) & -8UL);
+}
+
+void ia32_setup_frame(int sig, struct k_sigaction *ka,
+ compat_sigset_t *set, struct pt_regs * regs)
+{
+ struct sigframe __user *frame;
+ int err = 0;
+
+ frame = get_sigframe(ka, regs, sizeof(*frame));
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ goto give_sigsegv;
+
+ {
+ struct exec_domain *ed = current_thread_info()->exec_domain;
+ err |= __put_user((ed
+ && ed->signal_invmap
+ && sig < 32
+ ? ed->signal_invmap[sig]
+ : sig),
+ &frame->sig);
+ }
+ if (err)
+ goto give_sigsegv;
+
+ err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs,
+ set->sig[0]);
+ if (err)
+ goto give_sigsegv;
+
+ if (_COMPAT_NSIG_WORDS > 1) {
+ err |= __copy_to_user(frame->extramask, &set->sig[1],
+ sizeof(frame->extramask));
+ }
+ if (err)
+ goto give_sigsegv;
+
+ /* Return stub is in 32bit vsyscall page */
+ {
+ void __user *restorer = VSYSCALL32_SIGRETURN;
+ if (ka->sa.sa_flags & SA_RESTORER)
+ restorer = ka->sa.sa_restorer;
+ err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
+ }
+ /* These are actually not used anymore, but left because some
+ gdb versions depend on them as a marker. */
+ {
+ /* copy_to_user optimizes that into a single 8 byte store */
+ static const struct {
+ u16 poplmovl;
+ u32 val;
+ u16 int80;
+ u16 pad;
+ } __attribute__((packed)) code = {
+ 0xb858, /* popl %eax ; movl $...,%eax */
+ __NR_ia32_sigreturn,
+ 0x80cd, /* int $0x80 */
+ 0,
+ };
+ err |= __copy_to_user(frame->retcode, &code, 8);
+ }
+ if (err)
+ goto give_sigsegv;
+
+ /* Set up registers for signal handler */
+ regs->rsp = (unsigned long) frame;
+ regs->rip = (unsigned long) ka->sa.sa_handler;
+
+ asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
+ asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
+
+ regs->cs = __USER32_CS;
+ regs->ss = __USER32_DS;
+
+ set_fs(USER_DS);
+ if (regs->eflags & TF_MASK) {
+ if (current->ptrace & PT_PTRACED) {
+ ptrace_notify(SIGTRAP);
+ } else {
+ regs->eflags &= ~TF_MASK;
+ }
+ }
+
+#if DEBUG_SIG
+ printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+ current->comm, current->pid, frame, regs->rip, frame->pretcode);
+#endif
+
+ return;
+
+give_sigsegv:
+ force_sigsegv(sig, current);
+}
+
+void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+ compat_sigset_t *set, struct pt_regs * regs)
+{
+ struct rt_sigframe __user *frame;
+ int err = 0;
+
+ frame = get_sigframe(ka, regs, sizeof(*frame));
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ goto give_sigsegv;
+
+ {
+ struct exec_domain *ed = current_thread_info()->exec_domain;
+ err |= __put_user((ed
+ && ed->signal_invmap
+ && sig < 32
+ ? ed->signal_invmap[sig]
+ : sig),
+ &frame->sig);
+ }
+ err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
+ err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
+ err |= copy_siginfo_to_user32(&frame->info, info);
+ if (err)
+ goto give_sigsegv;
+
+ /* Create the ucontext. */
+ err |= __put_user(0, &frame->uc.uc_flags);
+ err |= __put_user(0, &frame->uc.uc_link);
+ err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+ err |= __put_user(sas_ss_flags(regs->rsp),
+ &frame->uc.uc_stack.ss_flags);
+ err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+ err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
+ regs, set->sig[0]);
+ err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
+ if (err)
+ goto give_sigsegv;
+
+
+ {
+ void __user *restorer = VSYSCALL32_RTSIGRETURN;
+ if (ka->sa.sa_flags & SA_RESTORER)
+ restorer = ka->sa.sa_restorer;
+ err |= __put_user(ptr_to_compat(restorer), &frame->pretcode);
+ }
+
+ /* This is movl $,%eax ; int $0x80 */
+ /* Not actually used anymore, but left because some gdb versions
+ need it. */
+ {
+ /* __copy_to_user optimizes that into a single 8 byte store */
+ static const struct {
+ u8 movl;
+ u32 val;
+ u16 int80;
+ u16 pad;
+ u8 pad2;
+ } __attribute__((packed)) code = {
+ 0xb8,
+ __NR_ia32_rt_sigreturn,
+ 0x80cd,
+ 0,
+ };
+ err |= __copy_to_user(frame->retcode, &code, 8);
+ }
+ if (err)
+ goto give_sigsegv;
+
+ /* Set up registers for signal handler */
+ regs->rsp = (unsigned long) frame;
+ regs->rip = (unsigned long) ka->sa.sa_handler;
+
+ asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
+ asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
+
+ regs->cs = __USER32_CS;
+ regs->ss = __USER32_DS;
+
+ set_fs(USER_DS);
+ if (regs->eflags & TF_MASK) {
+ if (current->ptrace & PT_PTRACED) {
+ ptrace_notify(SIGTRAP);
+ } else {
+ regs->eflags &= ~TF_MASK;
+ }
+ }
+
+#if DEBUG_SIG
+ printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+ current->comm, current->pid, frame, regs->rip, frame->pretcode);
+#endif
+
+ return;
+
+give_sigsegv:
+ force_sigsegv(sig, current);
+}
+
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
new file mode 100644
index 000000000000..f3ca0db85b5b
--- /dev/null
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -0,0 +1,602 @@
+/*
+ * Compatibility mode system call entry point for x86-64.
+ *
+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
+ */
+
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/offset.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/ia32_unistd.h>
+#include <asm/thread_info.h>
+#include <asm/segment.h>
+#include <asm/vsyscall32.h>
+#include <linux/linkage.h>
+
+ .macro IA32_ARG_FIXUP noebp=0
+ movl %edi,%r8d
+ .if \noebp
+ .else
+ movl %ebp,%r9d
+ .endif
+ xchg %ecx,%esi
+ movl %ebx,%edi
+ movl %edx,%edx /* zero extension */
+ .endm
+
+ /* clobbers %eax */
+ .macro CLEAR_RREGS
+ xorl %eax,%eax
+ movq %rax,R11(%rsp)
+ movq %rax,R10(%rsp)
+ movq %rax,R9(%rsp)
+ movq %rax,R8(%rsp)
+ .endm
+
+/*
+ * 32bit SYSENTER instruction entry.
+ *
+ * Arguments:
+ * %eax System call number.
+ * %ebx Arg1
+ * %ecx Arg2
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp user stack
+ * 0(%ebp) Arg6
+ *
+ * Interrupts off.
+ *
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. Set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
+ENTRY(ia32_sysenter_target)
+ CFI_STARTPROC
+ swapgs
+ movq %gs:pda_kernelstack, %rsp
+ addq $(PDA_STACKOFFSET),%rsp
+ sti
+ movl %ebp,%ebp /* zero extension */
+ pushq $__USER32_DS
+ pushq %rbp
+ pushfq
+ movl $VSYSCALL32_SYSEXIT, %r10d
+ pushq $__USER32_CS
+ movl %eax, %eax
+ pushq %r10
+ pushq %rax
+ cld
+ SAVE_ARGS 0,0,1
+ /* no need to do an access_ok check here because rbp has been
+ 32bit zero extended */
+1: movl (%rbp),%r9d
+ .section __ex_table,"a"
+ .quad 1b,ia32_badarg
+ .previous
+ GET_THREAD_INFO(%r10)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ jnz sysenter_tracesys
+sysenter_do_call:
+ cmpl $(IA32_NR_syscalls),%eax
+ jae ia32_badsys
+ IA32_ARG_FIXUP 1
+ call *ia32_sys_call_table(,%rax,8)
+ movq %rax,RAX-ARGOFFSET(%rsp)
+ GET_THREAD_INFO(%r10)
+ cli
+ testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
+ jnz int_ret_from_sys_call
+ /* clear IF, that popfq doesn't enable interrupts early */
+ andl $~0x200,EFLAGS-R11(%rsp)
+ RESTORE_ARGS 1,24,1,1,1,1
+ popfq
+ popq %rcx /* User %esp */
+ movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */
+ swapgs
+ sti /* sti only takes effect after the next instruction */
+ /* sysexit */
+ .byte 0xf, 0x35
+
+sysenter_tracesys:
+ SAVE_REST
+ CLEAR_RREGS
+ movq $-ENOSYS,RAX(%rsp) /* really needed? */
+ movq %rsp,%rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+ LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
+ RESTORE_REST
+ movl %ebp, %ebp
+ /* no need to do an access_ok check here because rbp has been
+ 32bit zero extended */
+1: movl (%rbp),%r9d
+ .section __ex_table,"a"
+ .quad 1b,ia32_badarg
+ .previous
+ jmp sysenter_do_call
+ CFI_ENDPROC
+
+/*
+ * 32bit SYSCALL instruction entry.
+ *
+ * Arguments:
+ * %eax System call number.
+ * %ebx Arg1
+ * %ecx return EIP
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
+ * %esp user stack
+ * 0(%esp) Arg6
+ *
+ * Interrupts off.
+ *
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. Set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
+ENTRY(ia32_cstar_target)
+ CFI_STARTPROC
+ swapgs
+ movl %esp,%r8d
+ movq %gs:pda_kernelstack,%rsp
+ sti
+ SAVE_ARGS 8,1,1
+ movl %eax,%eax /* zero extension */
+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
+ movq %rcx,RIP-ARGOFFSET(%rsp)
+ movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+ movl %ebp,%ecx
+ movq $__USER32_CS,CS-ARGOFFSET(%rsp)
+ movq $__USER32_DS,SS-ARGOFFSET(%rsp)
+ movq %r11,EFLAGS-ARGOFFSET(%rsp)
+ movq %r8,RSP-ARGOFFSET(%rsp)
+ /* no need to do an access_ok check here because r8 has been
+ 32bit zero extended */
+ /* hardware stack frame is complete now */
+1: movl (%r8),%r9d
+ .section __ex_table,"a"
+ .quad 1b,ia32_badarg
+ .previous
+ GET_THREAD_INFO(%r10)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ jnz cstar_tracesys
+cstar_do_call:
+ cmpl $IA32_NR_syscalls,%eax
+ jae ia32_badsys
+ IA32_ARG_FIXUP 1
+ call *ia32_sys_call_table(,%rax,8)
+ movq %rax,RAX-ARGOFFSET(%rsp)
+ GET_THREAD_INFO(%r10)
+ cli
+ testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
+ jnz int_ret_from_sys_call
+ RESTORE_ARGS 1,-ARG_SKIP,1,1,1
+ movl RIP-ARGOFFSET(%rsp),%ecx
+ movl EFLAGS-ARGOFFSET(%rsp),%r11d
+ movl RSP-ARGOFFSET(%rsp),%esp
+ swapgs
+ sysretl
+
+cstar_tracesys:
+ SAVE_REST
+ CLEAR_RREGS
+ movq $-ENOSYS,RAX(%rsp) /* really needed? */
+ movq %rsp,%rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+ LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
+ RESTORE_REST
+ movl RSP-ARGOFFSET(%rsp), %r8d
+ /* no need to do an access_ok check here because r8 has been
+ 32bit zero extended */
+1: movl (%r8),%r9d
+ .section __ex_table,"a"
+ .quad 1b,ia32_badarg
+ .previous
+ jmp cstar_do_call
+
+ia32_badarg:
+ movq $-EFAULT,%rax
+ jmp ia32_sysret
+ CFI_ENDPROC
+
+/*
+ * Emulated IA32 system calls via int 0x80.
+ *
+ * Arguments:
+ * %eax System call number.
+ * %ebx Arg1
+ * %ecx Arg2
+ * %edx Arg3
+ * %esi Arg4
+ * %edi Arg5
+ * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
+ *
+ * Notes:
+ * Uses the same stack frame as the x86-64 version.
+ * All registers except %eax must be saved (but ptrace may violate that)
+ * Arguments are zero extended. For system calls that want sign extension and
+ * take long arguments a wrapper is needed. Most calls can just be called
+ * directly.
+ * Assumes it is only called from user space and entered with interrupts off.
+ */
+
+ENTRY(ia32_syscall)
+ CFI_STARTPROC
+ swapgs
+ sti
+ movl %eax,%eax
+ pushq %rax
+ cld
+ /* note the registers are not zero extended to the sf.
+ this could be a problem. */
+ SAVE_ARGS 0,0,1
+ GET_THREAD_INFO(%r10)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
+ jnz ia32_tracesys
+ia32_do_syscall:
+ cmpl $(IA32_NR_syscalls),%eax
+ jae ia32_badsys
+ IA32_ARG_FIXUP
+ call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
+ia32_sysret:
+ movq %rax,RAX-ARGOFFSET(%rsp)
+ jmp int_ret_from_sys_call
+
+ia32_tracesys:
+ SAVE_REST
+ movq $-ENOSYS,RAX(%rsp) /* really needed? */
+ movq %rsp,%rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+ LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
+ RESTORE_REST
+ jmp ia32_do_syscall
+
+ia32_badsys:
+ movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+ movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+ jmp int_ret_from_sys_call
+
+ni_syscall:
+ movq %rax,%rdi
+ jmp sys32_ni_syscall
+
+quiet_ni_syscall:
+ movq $-ENOSYS,%rax
+ ret
+ CFI_ENDPROC
+
+ .macro PTREGSCALL label, func, arg
+ .globl \label
+\label:
+ leaq \func(%rip),%rax
+ leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
+ jmp ia32_ptregs_common
+ .endm
+
+ PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi
+ PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi
+ PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx
+ PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx
+ PTREGSCALL stub32_execve, sys32_execve, %rcx
+ PTREGSCALL stub32_fork, sys_fork, %rdi
+ PTREGSCALL stub32_clone, sys32_clone, %rdx
+ PTREGSCALL stub32_vfork, sys_vfork, %rdi
+ PTREGSCALL stub32_iopl, sys_iopl, %rsi
+ PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx
+
+ENTRY(ia32_ptregs_common)
+ CFI_STARTPROC
+ popq %r11
+ SAVE_REST
+ call *%rax
+ RESTORE_REST
+ jmp ia32_sysret /* misbalances the return cache */
+ CFI_ENDPROC
+
+ .data
+ .align 8
+ .globl ia32_sys_call_table
+ia32_sys_call_table:
+ .quad sys_restart_syscall
+ .quad sys_exit
+ .quad stub32_fork
+ .quad sys_read
+ .quad sys_write
+ .quad sys32_open /* 5 */
+ .quad sys_close
+ .quad sys32_waitpid
+ .quad sys_creat
+ .quad sys_link
+ .quad sys_unlink /* 10 */
+ .quad stub32_execve
+ .quad sys_chdir
+ .quad compat_sys_time
+ .quad sys_mknod
+ .quad sys_chmod /* 15 */
+ .quad sys_lchown16
+ .quad quiet_ni_syscall /* old break syscall holder */
+ .quad sys_stat
+ .quad sys32_lseek
+ .quad sys_getpid /* 20 */
+ .quad compat_sys_mount /* mount */
+ .quad sys_oldumount /* old_umount */
+ .quad sys_setuid16
+ .quad sys_getuid16
+ .quad compat_sys_stime /* stime */ /* 25 */
+ .quad sys32_ptrace /* ptrace */
+ .quad sys_alarm
+ .quad sys_fstat /* (old)fstat */
+ .quad sys_pause
+ .quad compat_sys_utime /* 30 */
+ .quad quiet_ni_syscall /* old stty syscall holder */
+ .quad quiet_ni_syscall /* old gtty syscall holder */
+ .quad sys_access
+ .quad sys_nice
+ .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */
+ .quad sys_sync
+ .quad sys32_kill
+ .quad sys_rename
+ .quad sys_mkdir
+ .quad sys_rmdir /* 40 */
+ .quad sys_dup
+ .quad sys32_pipe
+ .quad compat_sys_times
+ .quad quiet_ni_syscall /* old prof syscall holder */
+ .quad sys_brk /* 45 */
+ .quad sys_setgid16
+ .quad sys_getgid16
+ .quad sys_signal
+ .quad sys_geteuid16
+ .quad sys_getegid16 /* 50 */
+ .quad sys_acct
+ .quad sys_umount /* new_umount */
+ .quad quiet_ni_syscall /* old lock syscall holder */
+ .quad compat_sys_ioctl
+ .quad compat_sys_fcntl64 /* 55 */
+ .quad quiet_ni_syscall /* old mpx syscall holder */
+ .quad sys_setpgid
+ .quad quiet_ni_syscall /* old ulimit syscall holder */
+ .quad sys32_olduname
+ .quad sys_umask /* 60 */
+ .quad sys_chroot
+ .quad sys32_ustat
+ .quad sys_dup2
+ .quad sys_getppid
+ .quad sys_getpgrp /* 65 */
+ .quad sys_setsid
+ .quad sys32_sigaction
+ .quad sys_sgetmask
+ .quad sys_ssetmask
+ .quad sys_setreuid16 /* 70 */
+ .quad sys_setregid16
+ .quad stub32_sigsuspend
+ .quad compat_sys_sigpending
+ .quad sys_sethostname
+ .quad compat_sys_setrlimit /* 75 */
+ .quad compat_sys_old_getrlimit /* old_getrlimit */
+ .quad compat_sys_getrusage
+ .quad sys32_gettimeofday
+ .quad sys32_settimeofday
+ .quad sys_getgroups16 /* 80 */
+ .quad sys_setgroups16
+ .quad sys32_old_select
+ .quad sys_symlink
+ .quad sys_lstat
+ .quad sys_readlink /* 85 */
+#ifdef CONFIG_IA32_AOUT
+ .quad sys_uselib
+#else
+ .quad quiet_ni_syscall
+#endif
+ .quad sys_swapon
+ .quad sys_reboot
+ .quad compat_sys_old_readdir
+ .quad sys32_mmap /* 90 */
+ .quad sys_munmap
+ .quad sys_truncate
+ .quad sys_ftruncate
+ .quad sys_fchmod
+ .quad sys_fchown16 /* 95 */
+ .quad sys_getpriority
+ .quad sys_setpriority
+ .quad quiet_ni_syscall /* old profil syscall holder */
+ .quad compat_sys_statfs
+ .quad compat_sys_fstatfs /* 100 */
+ .quad sys_ioperm
+ .quad compat_sys_socketcall
+ .quad sys_syslog
+ .quad compat_sys_setitimer
+ .quad compat_sys_getitimer /* 105 */
+ .quad compat_sys_newstat
+ .quad compat_sys_newlstat
+ .quad compat_sys_newfstat
+ .quad sys32_uname
+ .quad stub32_iopl /* 110 */
+ .quad sys_vhangup
+ .quad quiet_ni_syscall /* old "idle" system call */
+ .quad sys32_vm86_warning /* vm86old */
+ .quad compat_sys_wait4
+ .quad sys_swapoff /* 115 */
+ .quad sys32_sysinfo
+ .quad sys32_ipc
+ .quad sys_fsync
+ .quad stub32_sigreturn
+ .quad stub32_clone /* 120 */
+ .quad sys_setdomainname
+ .quad sys_uname
+ .quad sys_modify_ldt
+ .quad sys32_adjtimex
+ .quad sys32_mprotect /* 125 */
+ .quad compat_sys_sigprocmask
+ .quad quiet_ni_syscall /* create_module */
+ .quad sys_init_module
+ .quad sys_delete_module
+ .quad quiet_ni_syscall /* 130 get_kernel_syms */
+ .quad sys_quotactl
+ .quad sys_getpgid
+ .quad sys_fchdir
+ .quad quiet_ni_syscall /* bdflush */
+ .quad sys_sysfs /* 135 */
+ .quad sys_personality
+ .quad quiet_ni_syscall /* for afs_syscall */
+ .quad sys_setfsuid16
+ .quad sys_setfsgid16
+ .quad sys_llseek /* 140 */
+ .quad compat_sys_getdents
+ .quad compat_sys_select
+ .quad sys_flock
+ .quad sys_msync
+ .quad compat_sys_readv /* 145 */
+ .quad compat_sys_writev
+ .quad sys_getsid
+ .quad sys_fdatasync
+ .quad sys32_sysctl /* sysctl */
+ .quad sys_mlock /* 150 */
+ .quad sys_munlock
+ .quad sys_mlockall
+ .quad sys_munlockall
+ .quad sys_sched_setparam
+ .quad sys_sched_getparam /* 155 */
+ .quad sys_sched_setscheduler
+ .quad sys_sched_getscheduler
+ .quad sys_sched_yield
+ .quad sys_sched_get_priority_max
+ .quad sys_sched_get_priority_min /* 160 */
+ .quad sys_sched_rr_get_interval
+ .quad compat_sys_nanosleep
+ .quad sys_mremap
+ .quad sys_setresuid16
+ .quad sys_getresuid16 /* 165 */
+ .quad sys32_vm86_warning /* vm86 */
+ .quad quiet_ni_syscall /* query_module */
+ .quad sys_poll
+ .quad compat_sys_nfsservctl
+ .quad sys_setresgid16 /* 170 */
+ .quad sys_getresgid16
+ .quad sys_prctl
+ .quad stub32_rt_sigreturn
+ .quad sys32_rt_sigaction
+ .quad sys32_rt_sigprocmask /* 175 */
+ .quad sys32_rt_sigpending
+ .quad compat_sys_rt_sigtimedwait
+ .quad sys32_rt_sigqueueinfo
+ .quad stub32_rt_sigsuspend
+ .quad sys32_pread /* 180 */
+ .quad sys32_pwrite
+ .quad sys_chown16
+ .quad sys_getcwd
+ .quad sys_capget
+ .quad sys_capset
+ .quad stub32_sigaltstack
+ .quad sys32_sendfile
+ .quad quiet_ni_syscall /* streams1 */
+ .quad quiet_ni_syscall /* streams2 */
+ .quad stub32_vfork /* 190 */
+ .quad compat_sys_getrlimit
+ .quad sys32_mmap2
+ .quad sys32_truncate64
+ .quad sys32_ftruncate64
+ .quad sys32_stat64 /* 195 */
+ .quad sys32_lstat64
+ .quad sys32_fstat64
+ .quad sys_lchown
+ .quad sys_getuid
+ .quad sys_getgid /* 200 */
+ .quad sys_geteuid
+ .quad sys_getegid
+ .quad sys_setreuid
+ .quad sys_setregid
+ .quad sys_getgroups /* 205 */
+ .quad sys_setgroups
+ .quad sys_fchown
+ .quad sys_setresuid
+ .quad sys_getresuid
+ .quad sys_setresgid /* 210 */
+ .quad sys_getresgid
+ .quad sys_chown
+ .quad sys_setuid
+ .quad sys_setgid
+ .quad sys_setfsuid /* 215 */
+ .quad sys_setfsgid
+ .quad sys_pivot_root
+ .quad sys_mincore
+ .quad sys_madvise
+ .quad compat_sys_getdents64 /* 220 getdents64 */
+ .quad compat_sys_fcntl64
+ .quad quiet_ni_syscall /* tux */
+ .quad quiet_ni_syscall /* security */
+ .quad sys_gettid
+ .quad sys_readahead /* 225 */
+ .quad sys_setxattr
+ .quad sys_lsetxattr
+ .quad sys_fsetxattr
+ .quad sys_getxattr
+ .quad sys_lgetxattr /* 230 */
+ .quad sys_fgetxattr
+ .quad sys_listxattr
+ .quad sys_llistxattr
+ .quad sys_flistxattr
+ .quad sys_removexattr /* 235 */
+ .quad sys_lremovexattr
+ .quad sys_fremovexattr
+ .quad sys_tkill
+ .quad sys_sendfile64
+ .quad compat_sys_futex /* 240 */
+ .quad compat_sys_sched_setaffinity
+ .quad compat_sys_sched_getaffinity
+ .quad sys32_set_thread_area
+ .quad sys32_get_thread_area
+ .quad compat_sys_io_setup /* 245 */
+ .quad sys_io_destroy
+ .quad compat_sys_io_getevents
+ .quad compat_sys_io_submit
+ .quad sys_io_cancel
+ .quad sys_fadvise64 /* 250 */
+ .quad quiet_ni_syscall /* free_huge_pages */
+ .quad sys_exit_group
+ .quad sys32_lookup_dcookie
+ .quad sys_epoll_create
+ .quad sys_epoll_ctl /* 255 */
+ .quad sys_epoll_wait
+ .quad sys_remap_file_pages
+ .quad sys_set_tid_address
+ .quad sys32_timer_create
+ .quad compat_sys_timer_settime /* 260 */
+ .quad compat_sys_timer_gettime
+ .quad sys_timer_getoverrun
+ .quad sys_timer_delete
+ .quad compat_sys_clock_settime
+ .quad compat_sys_clock_gettime /* 265 */
+ .quad compat_sys_clock_getres
+ .quad compat_sys_clock_nanosleep
+ .quad compat_sys_statfs64
+ .quad compat_sys_fstatfs64
+ .quad sys_tgkill /* 270 */
+ .quad compat_sys_utimes
+ .quad sys32_fadvise64_64
+ .quad quiet_ni_syscall /* sys_vserver */
+ .quad sys_mbind
+ .quad compat_sys_get_mempolicy /* 275 */
+ .quad sys_set_mempolicy
+ .quad compat_sys_mq_open
+ .quad sys_mq_unlink
+ .quad compat_sys_mq_timedsend
+ .quad compat_sys_mq_timedreceive /* 280 */
+ .quad compat_sys_mq_notify
+ .quad compat_sys_mq_getsetattr
+ .quad quiet_ni_syscall /* reserved for kexec */
+ .quad compat_sys_waitid
+ .quad quiet_ni_syscall /* sys_altroot */
+ .quad sys_add_key
+ .quad sys_request_key
+ .quad sys_keyctl
+ /* don't forget to change IA32_NR_syscalls */
+ia32_syscall_end:
+ .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
+ .quad ni_syscall
+ .endr
diff --git a/arch/x86_64/ia32/ipc32.c b/arch/x86_64/ia32/ipc32.c
new file mode 100644
index 000000000000..369151dc3213
--- /dev/null
+++ b/arch/x86_64/ia32/ipc32.c
@@ -0,0 +1,57 @@
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/syscalls.h>
+#include <linux/time.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/shm.h>
+#include <linux/ipc.h>
+#include <linux/compat.h>
+
+#include <asm-i386/ipc.h>
+
+asmlinkage long
+sys32_ipc(u32 call, int first, int second, int third,
+ compat_uptr_t ptr, u32 fifth)
+{
+ int version;
+
+ version = call >> 16; /* hack for backward compatibility */
+ call &= 0xffff;
+
+ switch (call) {
+ case SEMOP:
+ /* struct sembuf is the same on 32 and 64bit :)) */
+ return sys_semtimedop(first, compat_ptr(ptr), second, NULL);
+ case SEMTIMEDOP:
+ return compat_sys_semtimedop(first, compat_ptr(ptr), second,
+ compat_ptr(fifth));
+ case SEMGET:
+ return sys_semget(first, second, third);
+ case SEMCTL:
+ return compat_sys_semctl(first, second, third, compat_ptr(ptr));
+
+ case MSGSND:
+ return compat_sys_msgsnd(first, second, third, compat_ptr(ptr));
+ case MSGRCV:
+ return compat_sys_msgrcv(first, second, fifth, third,
+ version, compat_ptr(ptr));
+ case MSGGET:
+ return sys_msgget((key_t) first, second);
+ case MSGCTL:
+ return compat_sys_msgctl(first, second, compat_ptr(ptr));
+
+ case SHMAT:
+ return compat_sys_shmat(first, second, third, version,
+ compat_ptr(ptr));
+ break;
+ case SHMDT:
+ return sys_shmdt(compat_ptr(ptr));
+ case SHMGET:
+ return sys_shmget(first, (unsigned)second, third);
+ case SHMCTL:
+ return compat_sys_shmctl(first, second, compat_ptr(ptr));
+ }
+ return -ENOSYS;
+}
diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c
new file mode 100644
index 000000000000..b98b6d2462f6
--- /dev/null
+++ b/arch/x86_64/ia32/ptrace32.c
@@ -0,0 +1,379 @@
+/*
+ * 32bit ptrace for x86-64.
+ *
+ * Copyright 2001,2002 Andi Kleen, SuSE Labs.
+ * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier
+ * copyright.
+ *
+ * This allows to access 64bit processes too; but there is no way to see the extended
+ * register contents.
+ *
+ * $Id: ptrace32.c,v 1.16 2003/03/14 16:06:35 ak Exp $
+ */
+
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/mm.h>
+#include <linux/ptrace.h>
+#include <asm/ptrace.h>
+#include <asm/compat.h>
+#include <asm/uaccess.h>
+#include <asm/user32.h>
+#include <asm/user.h>
+#include <asm/errno.h>
+#include <asm/debugreg.h>
+#include <asm/i387.h>
+#include <asm/fpu32.h>
+
+/* determines which flags the user has access to. */
+/* 1 = access 0 = no access */
+#define FLAG_MASK 0x44dd5UL
+
+#define R32(l,q) \
+ case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break
+
+static int putreg32(struct task_struct *child, unsigned regno, u32 val)
+{
+ int i;
+ __u64 *stack = (__u64 *)(child->thread.rsp0 - sizeof(struct pt_regs));
+
+ switch (regno) {
+ case offsetof(struct user32, regs.fs):
+ if (val && (val & 3) != 3) return -EIO;
+ child->thread.fs = val & 0xffff;
+ break;
+ case offsetof(struct user32, regs.gs):
+ if (val && (val & 3) != 3) return -EIO;
+ child->thread.gs = val & 0xffff;
+ break;
+ case offsetof(struct user32, regs.ds):
+ if (val && (val & 3) != 3) return -EIO;
+ child->thread.ds = val & 0xffff;
+ break;
+ case offsetof(struct user32, regs.es):
+ child->thread.es = val & 0xffff;
+ break;
+ case offsetof(struct user32, regs.ss):
+ if ((val & 3) != 3) return -EIO;
+ stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff;
+ break;
+ case offsetof(struct user32, regs.cs):
+ if ((val & 3) != 3) return -EIO;
+ stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff;
+ break;
+
+ R32(ebx, rbx);
+ R32(ecx, rcx);
+ R32(edx, rdx);
+ R32(edi, rdi);
+ R32(esi, rsi);
+ R32(ebp, rbp);
+ R32(eax, rax);
+ R32(orig_eax, orig_rax);
+ R32(eip, rip);
+ R32(esp, rsp);
+
+ case offsetof(struct user32, regs.eflags): {
+ __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8];
+ val &= FLAG_MASK;
+ *flags = val | (*flags & ~FLAG_MASK);
+ break;
+ }
+
+ case offsetof(struct user32, u_debugreg[4]):
+ case offsetof(struct user32, u_debugreg[5]):
+ return -EIO;
+
+ case offsetof(struct user32, u_debugreg[0]):
+ child->thread.debugreg0 = val;
+ break;
+
+ case offsetof(struct user32, u_debugreg[1]):
+ child->thread.debugreg1 = val;
+ break;
+
+ case offsetof(struct user32, u_debugreg[2]):
+ child->thread.debugreg2 = val;
+ break;
+
+ case offsetof(struct user32, u_debugreg[3]):
+ child->thread.debugreg3 = val;
+ break;
+
+ case offsetof(struct user32, u_debugreg[6]):
+ child->thread.debugreg6 = val;
+ break;
+
+ case offsetof(struct user32, u_debugreg[7]):
+ val &= ~DR_CONTROL_RESERVED;
+ /* See arch/i386/kernel/ptrace.c for an explanation of
+ * this awkward check.*/
+ for(i=0; i<4; i++)
+ if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1)
+ return -EIO;
+ child->thread.debugreg7 = val;
+ break;
+
+ default:
+ if (regno > sizeof(struct user32) || (regno & 3))
+ return -EIO;
+
+ /* Other dummy fields in the virtual user structure are ignored */
+ break;
+ }
+ return 0;
+}
+
+#undef R32
+
+#define R32(l,q) \
+ case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break
+
+static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
+{
+ __u64 *stack = (__u64 *)(child->thread.rsp0 - sizeof(struct pt_regs));
+
+ switch (regno) {
+ case offsetof(struct user32, regs.fs):
+ *val = child->thread.fs;
+ break;
+ case offsetof(struct user32, regs.gs):
+ *val = child->thread.gs;
+ break;
+ case offsetof(struct user32, regs.ds):
+ *val = child->thread.ds;
+ break;
+ case offsetof(struct user32, regs.es):
+ *val = child->thread.es;
+ break;
+
+ R32(cs, cs);
+ R32(ss, ss);
+ R32(ebx, rbx);
+ R32(ecx, rcx);
+ R32(edx, rdx);
+ R32(edi, rdi);
+ R32(esi, rsi);
+ R32(ebp, rbp);
+ R32(eax, rax);
+ R32(orig_eax, orig_rax);
+ R32(eip, rip);
+ R32(eflags, eflags);
+ R32(esp, rsp);
+
+ case offsetof(struct user32, u_debugreg[0]):
+ *val = child->thread.debugreg0;
+ break;
+ case offsetof(struct user32, u_debugreg[1]):
+ *val = child->thread.debugreg1;
+ break;
+ case offsetof(struct user32, u_debugreg[2]):
+ *val = child->thread.debugreg2;
+ break;
+ case offsetof(struct user32, u_debugreg[3]):
+ *val = child->thread.debugreg3;
+ break;
+ case offsetof(struct user32, u_debugreg[6]):
+ *val = child->thread.debugreg6;
+ break;
+ case offsetof(struct user32, u_debugreg[7]):
+ *val = child->thread.debugreg7;
+ break;
+
+ default:
+ if (regno > sizeof(struct user32) || (regno & 3))
+ return -EIO;
+
+ /* Other dummy fields in the virtual user structure are ignored */
+ *val = 0;
+ break;
+ }
+ return 0;
+}
+
+#undef R32
+
+static struct task_struct *find_target(int request, int pid, int *err)
+{
+ struct task_struct *child;
+
+ *err = -EPERM;
+ if (pid == 1)
+ return NULL;
+
+ *err = -ESRCH;
+ read_lock(&tasklist_lock);
+ child = find_task_by_pid(pid);
+ if (child)
+ get_task_struct(child);
+ read_unlock(&tasklist_lock);
+ if (child) {
+ *err = -EPERM;
+ if (child->pid == 1)
+ goto out;
+ *err = ptrace_check_attach(child, request == PTRACE_KILL);
+ if (*err < 0)
+ goto out;
+ return child;
+ }
+ out:
+ if (child)
+ put_task_struct(child);
+ return NULL;
+
+}
+
+asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
+{
+ struct task_struct *child;
+ struct pt_regs *childregs;
+ void __user *datap = compat_ptr(data);
+ int ret;
+ __u32 val;
+
+ switch (request) {
+ default:
+ return sys_ptrace(request, pid, addr, data);
+
+ case PTRACE_PEEKTEXT:
+ case PTRACE_PEEKDATA:
+ case PTRACE_POKEDATA:
+ case PTRACE_POKETEXT:
+ case PTRACE_POKEUSR:
+ case PTRACE_PEEKUSR:
+ case PTRACE_GETREGS:
+ case PTRACE_SETREGS:
+ case PTRACE_SETFPREGS:
+ case PTRACE_GETFPREGS:
+ case PTRACE_SETFPXREGS:
+ case PTRACE_GETFPXREGS:
+ case PTRACE_GETEVENTMSG:
+ break;
+ }
+
+ child = find_target(request, pid, &ret);
+ if (!child)
+ return ret;
+
+ childregs = (struct pt_regs *)(child->thread.rsp0 - sizeof(struct pt_regs));
+
+ switch (request) {
+ case PTRACE_PEEKDATA:
+ case PTRACE_PEEKTEXT:
+ ret = 0;
+ if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32))
+ ret = -EIO;
+ else
+ ret = put_user(val, (unsigned int __user *)datap);
+ break;
+
+ case PTRACE_POKEDATA:
+ case PTRACE_POKETEXT:
+ ret = 0;
+ if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32))
+ ret = -EIO;
+ break;
+
+ case PTRACE_PEEKUSR:
+ ret = getreg32(child, addr, &val);
+ if (ret == 0)
+ ret = put_user(val, (__u32 __user *)datap);
+ break;
+
+ case PTRACE_POKEUSR:
+ ret = putreg32(child, addr, data);
+ break;
+
+ case PTRACE_GETREGS: { /* Get all gp regs from the child. */
+ int i;
+ if (!access_ok(VERIFY_WRITE, datap, 16*4)) {
+ ret = -EIO;
+ break;
+ }
+ ret = 0;
+ for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) {
+ getreg32(child, i, &val);
+ ret |= __put_user(val,(u32 __user *)datap);
+ datap += sizeof(u32);
+ }
+ break;
+ }
+
+ case PTRACE_SETREGS: { /* Set all gp regs in the child. */
+ unsigned long tmp;
+ int i;
+ if (!access_ok(VERIFY_READ, datap, 16*4)) {
+ ret = -EIO;
+ break;
+ }
+ ret = 0;
+ for ( i = 0; i <= 16*4; i += sizeof(u32) ) {
+ ret |= __get_user(tmp, (u32 __user *)datap);
+ putreg32(child, i, tmp);
+ datap += sizeof(u32);
+ }
+ break;
+ }
+
+ case PTRACE_GETFPREGS:
+ ret = -EIO;
+ if (!access_ok(VERIFY_READ, compat_ptr(data),
+ sizeof(struct user_i387_struct)))
+ break;
+ save_i387_ia32(child, datap, childregs, 1);
+ ret = 0;
+ break;
+
+ case PTRACE_SETFPREGS:
+ ret = -EIO;
+ if (!access_ok(VERIFY_WRITE, datap,
+ sizeof(struct user_i387_struct)))
+ break;
+ ret = 0;
+ /* don't check EFAULT to be bug-to-bug compatible to i386 */
+ restore_i387_ia32(child, datap, 1);
+ break;
+
+ case PTRACE_GETFPXREGS: {
+ struct user32_fxsr_struct __user *u = datap;
+ init_fpu(child);
+ ret = -EIO;
+ if (!access_ok(VERIFY_WRITE, u, sizeof(*u)))
+ break;
+ ret = -EFAULT;
+ if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u)))
+ break;
+ ret = __put_user(childregs->cs, &u->fcs);
+ ret |= __put_user(child->thread.ds, &u->fos);
+ break;
+ }
+ case PTRACE_SETFPXREGS: {
+ struct user32_fxsr_struct __user *u = datap;
+ unlazy_fpu(child);
+ ret = -EIO;
+ if (!access_ok(VERIFY_READ, u, sizeof(*u)))
+ break;
+ /* no checking to be bug-to-bug compatible with i386 */
+ __copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u));
+ set_stopped_child_used_math(child);
+ child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
+ ret = 0;
+ break;
+ }
+
+ case PTRACE_GETEVENTMSG:
+ ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data));
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ put_task_struct(child);
+ return ret;
+}
+
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
new file mode 100644
index 000000000000..68a9ab06ee7c
--- /dev/null
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -0,0 +1,1050 @@
+/*
+ * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on
+ * sys_sparc32
+ *
+ * Copyright (C) 2000 VA Linux Co
+ * Copyright (C) 2000 Don Dugger <n0ano@valinux.com>
+ * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com>
+ * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 2000 Hewlett-Packard Co.
+ * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port)
+ *
+ * These routines maintain argument size conversion between 32bit and 64bit
+ * environment. In 2.5 most of this should be moved to a generic directory.
+ *
+ * This file assumes that there is a hole at the end of user address space.
+ *
+ * Some of the functions are LE specific currently. These are hopefully all marked.
+ * This should be fixed.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/signal.h>
+#include <linux/syscalls.h>
+#include <linux/resource.h>
+#include <linux/times.h>
+#include <linux/utsname.h>
+#include <linux/timex.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/mm.h>
+#include <linux/shm.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/nfs_fs.h>
+#include <linux/quota.h>
+#include <linux/module.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/nfsd/nfsd.h>
+#include <linux/nfsd/cache.h>
+#include <linux/nfsd/xdr.h>
+#include <linux/nfsd/syscall.h>
+#include <linux/poll.h>
+#include <linux/personality.h>
+#include <linux/stat.h>
+#include <linux/ipc.h>
+#include <linux/rwsem.h>
+#include <linux/binfmts.h>
+#include <linux/init.h>
+#include <linux/aio_abi.h>
+#include <linux/aio.h>
+#include <linux/compat.h>
+#include <linux/vfs.h>
+#include <linux/ptrace.h>
+#include <linux/highuid.h>
+#include <linux/vmalloc.h>
+#include <asm/mman.h>
+#include <asm/types.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+#include <asm/atomic.h>
+#include <asm/ldt.h>
+
+#include <net/scm.h>
+#include <net/sock.h>
+#include <asm/ia32.h>
+
+#define AA(__x) ((unsigned long)(__x))
+
+int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
+{
+ typeof(ubuf->st_uid) uid = 0;
+ typeof(ubuf->st_gid) gid = 0;
+ SET_UID(uid, kbuf->uid);
+ SET_GID(gid, kbuf->gid);
+ if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev))
+ return -EOVERFLOW;
+ if (kbuf->size >= 0x7fffffff)
+ return -EOVERFLOW;
+ if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
+ __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
+ __put_user (kbuf->ino, &ubuf->st_ino) ||
+ __put_user (kbuf->mode, &ubuf->st_mode) ||
+ __put_user (kbuf->nlink, &ubuf->st_nlink) ||
+ __put_user (uid, &ubuf->st_uid) ||
+ __put_user (gid, &ubuf->st_gid) ||
+ __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
+ __put_user (kbuf->size, &ubuf->st_size) ||
+ __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) ||
+ __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
+ __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
+ __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
+ __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
+ __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
+ __put_user (kbuf->blksize, &ubuf->st_blksize) ||
+ __put_user (kbuf->blocks, &ubuf->st_blocks))
+ return -EFAULT;
+ return 0;
+}
+
+asmlinkage long
+sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high)
+{
+ return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low);
+}
+
+asmlinkage long
+sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high)
+{
+ return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
+}
+
+/* Another set for IA32/LFS -- x86_64 struct stat is different due to
+ support for 64bit inode numbers. */
+
+static int
+cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
+{
+ typeof(ubuf->st_uid) uid = 0;
+ typeof(ubuf->st_gid) gid = 0;
+ SET_UID(uid, stat->uid);
+ SET_GID(gid, stat->gid);
+ if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) ||
+ __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
+ __put_user (stat->ino, &ubuf->__st_ino) ||
+ __put_user (stat->ino, &ubuf->st_ino) ||
+ __put_user (stat->mode, &ubuf->st_mode) ||
+ __put_user (stat->nlink, &ubuf->st_nlink) ||
+ __put_user (uid, &ubuf->st_uid) ||
+ __put_user (gid, &ubuf->st_gid) ||
+ __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
+ __put_user (stat->size, &ubuf->st_size) ||
+ __put_user (stat->atime.tv_sec, &ubuf->st_atime) ||
+ __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
+ __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) ||
+ __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
+ __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) ||
+ __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
+ __put_user (stat->blksize, &ubuf->st_blksize) ||
+ __put_user (stat->blocks, &ubuf->st_blocks))
+ return -EFAULT;
+ return 0;
+}
+
+asmlinkage long
+sys32_stat64(char __user * filename, struct stat64 __user *statbuf)
+{
+ struct kstat stat;
+ int ret = vfs_stat(filename, &stat);
+ if (!ret)
+ ret = cp_stat64(statbuf, &stat);
+ return ret;
+}
+
+asmlinkage long
+sys32_lstat64(char __user * filename, struct stat64 __user *statbuf)
+{
+ struct kstat stat;
+ int ret = vfs_lstat(filename, &stat);
+ if (!ret)
+ ret = cp_stat64(statbuf, &stat);
+ return ret;
+}
+
+asmlinkage long
+sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
+{
+ struct kstat stat;
+ int ret = vfs_fstat(fd, &stat);
+ if (!ret)
+ ret = cp_stat64(statbuf, &stat);
+ return ret;
+}
+
+/*
+ * Linux/i386 didn't use to be able to handle more than
+ * 4 system call parameters, so these system calls used a memory
+ * block for parameter passing..
+ */
+
+struct mmap_arg_struct {
+ unsigned int addr;
+ unsigned int len;
+ unsigned int prot;
+ unsigned int flags;
+ unsigned int fd;
+ unsigned int offset;
+};
+
+asmlinkage long
+sys32_mmap(struct mmap_arg_struct __user *arg)
+{
+ struct mmap_arg_struct a;
+ struct file *file = NULL;
+ unsigned long retval;
+ struct mm_struct *mm ;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+
+ if (a.offset & ~PAGE_MASK)
+ return -EINVAL;
+
+ if (!(a.flags & MAP_ANONYMOUS)) {
+ file = fget(a.fd);
+ if (!file)
+ return -EBADF;
+ }
+
+ mm = current->mm;
+ down_write(&mm->mmap_sem);
+ retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT);
+ if (file)
+ fput(file);
+
+ up_write(&mm->mmap_sem);
+
+ return retval;
+}
+
+asmlinkage long
+sys32_mprotect(unsigned long start, size_t len, unsigned long prot)
+{
+ return sys_mprotect(start,len,prot);
+}
+
+asmlinkage long
+sys32_pipe(int __user *fd)
+{
+ int retval;
+ int fds[2];
+
+ retval = do_pipe(fds);
+ if (retval)
+ goto out;
+ if (copy_to_user(fd, fds, sizeof(fds)))
+ retval = -EFAULT;
+ out:
+ return retval;
+}
+
+asmlinkage long
+sys32_rt_sigaction(int sig, struct sigaction32 __user *act,
+ struct sigaction32 __user *oact, unsigned int sigsetsize)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+ compat_sigset_t set32;
+
+ /* XXX: Don't preclude handling different sized sigset_t's. */
+ if (sigsetsize != sizeof(compat_sigset_t))
+ return -EINVAL;
+
+ if (act) {
+ compat_uptr_t handler, restorer;
+
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(handler, &act->sa_handler) ||
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+ __get_user(restorer, &act->sa_restorer)||
+ __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ new_ka.sa.sa_handler = compat_ptr(handler);
+ new_ka.sa.sa_restorer = compat_ptr(restorer);
+ /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */
+ switch (_NSIG_WORDS) {
+ case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6]
+ | (((long)set32.sig[7]) << 32);
+ case 3: new_ka.sa.sa_mask.sig[2] = set32.sig[4]
+ | (((long)set32.sig[5]) << 32);
+ case 2: new_ka.sa.sa_mask.sig[1] = set32.sig[2]
+ | (((long)set32.sig[3]) << 32);
+ case 1: new_ka.sa.sa_mask.sig[0] = set32.sig[0]
+ | (((long)set32.sig[1]) << 32);
+ }
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */
+ switch (_NSIG_WORDS) {
+ case 4:
+ set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32);
+ set32.sig[6] = old_ka.sa.sa_mask.sig[3];
+ case 3:
+ set32.sig[5] = (old_ka.sa.sa_mask.sig[2] >> 32);
+ set32.sig[4] = old_ka.sa.sa_mask.sig[2];
+ case 2:
+ set32.sig[3] = (old_ka.sa.sa_mask.sig[1] >> 32);
+ set32.sig[2] = old_ka.sa.sa_mask.sig[1];
+ case 1:
+ set32.sig[1] = (old_ka.sa.sa_mask.sig[0] >> 32);
+ set32.sig[0] = old_ka.sa.sa_mask.sig[0];
+ }
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
+ __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) ||
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+ __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
+asmlinkage long
+sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact)
+{
+ struct k_sigaction new_ka, old_ka;
+ int ret;
+
+ if (act) {
+ compat_old_sigset_t mask;
+ compat_uptr_t handler, restorer;
+
+ if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+ __get_user(handler, &act->sa_handler) ||
+ __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+ __get_user(restorer, &act->sa_restorer) ||
+ __get_user(mask, &act->sa_mask))
+ return -EFAULT;
+
+ new_ka.sa.sa_handler = compat_ptr(handler);
+ new_ka.sa.sa_restorer = compat_ptr(restorer);
+
+ siginitset(&new_ka.sa.sa_mask, mask);
+ }
+
+ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+ if (!ret && oact) {
+ if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+ __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) ||
+ __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) ||
+ __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
+asmlinkage long
+sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
+ compat_sigset_t __user *oset, unsigned int sigsetsize)
+{
+ sigset_t s;
+ compat_sigset_t s32;
+ int ret;
+ mm_segment_t old_fs = get_fs();
+
+ if (set) {
+ if (copy_from_user (&s32, set, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ switch (_NSIG_WORDS) {
+ case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32);
+ case 3: s.sig[2] = s32.sig[4] | (((long)s32.sig[5]) << 32);
+ case 2: s.sig[1] = s32.sig[2] | (((long)s32.sig[3]) << 32);
+ case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32);
+ }
+ }
+ set_fs (KERNEL_DS);
+ ret = sys_rt_sigprocmask(how, set ? &s : NULL, oset ? &s : NULL,
+ sigsetsize);
+ set_fs (old_fs);
+ if (ret) return ret;
+ if (oset) {
+ switch (_NSIG_WORDS) {
+ case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
+ case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
+ case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
+ case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
+ }
+ if (copy_to_user (oset, &s32, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static inline long
+get_tv32(struct timeval *o, struct compat_timeval __user *i)
+{
+ int err = -EFAULT;
+ if (access_ok(VERIFY_READ, i, sizeof(*i))) {
+ err = __get_user(o->tv_sec, &i->tv_sec);
+ err |= __get_user(o->tv_usec, &i->tv_usec);
+ }
+ return err;
+}
+
+static inline long
+put_tv32(struct compat_timeval __user *o, struct timeval *i)
+{
+ int err = -EFAULT;
+ if (access_ok(VERIFY_WRITE, o, sizeof(*o))) {
+ err = __put_user(i->tv_sec, &o->tv_sec);
+ err |= __put_user(i->tv_usec, &o->tv_usec);
+ }
+ return err;
+}
+
+extern int do_setitimer(int which, struct itimerval *, struct itimerval *);
+
+asmlinkage long
+sys32_alarm(unsigned int seconds)
+{
+ struct itimerval it_new, it_old;
+ unsigned int oldalarm;
+
+ it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
+ it_new.it_value.tv_sec = seconds;
+ it_new.it_value.tv_usec = 0;
+ do_setitimer(ITIMER_REAL, &it_new, &it_old);
+ oldalarm = it_old.it_value.tv_sec;
+ /* ehhh.. We can't return 0 if we have an alarm pending.. */
+ /* And we'd better return too much than too little anyway */
+ if (it_old.it_value.tv_usec)
+ oldalarm++;
+ return oldalarm;
+}
+
+/* Translations due to time_t size differences. Which affects all
+ sorts of things, like timeval and itimerval. */
+
+extern struct timezone sys_tz;
+
+asmlinkage long
+sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
+{
+ if (tv) {
+ struct timeval ktv;
+ do_gettimeofday(&ktv);
+ if (put_tv32(tv, &ktv))
+ return -EFAULT;
+ }
+ if (tz) {
+ if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+asmlinkage long
+sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz)
+{
+ struct timeval ktv;
+ struct timespec kts;
+ struct timezone ktz;
+
+ if (tv) {
+ if (get_tv32(&ktv, tv))
+ return -EFAULT;
+ kts.tv_sec = ktv.tv_sec;
+ kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC;
+ }
+ if (tz) {
+ if (copy_from_user(&ktz, tz, sizeof(ktz)))
+ return -EFAULT;
+ }
+
+ return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+}
+
+struct sel_arg_struct {
+ unsigned int n;
+ unsigned int inp;
+ unsigned int outp;
+ unsigned int exp;
+ unsigned int tvp;
+};
+
+asmlinkage long
+sys32_old_select(struct sel_arg_struct __user *arg)
+{
+ struct sel_arg_struct a;
+
+ if (copy_from_user(&a, arg, sizeof(a)))
+ return -EFAULT;
+ return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
+ compat_ptr(a.exp), compat_ptr(a.tvp));
+}
+
+extern asmlinkage long
+compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options,
+ struct compat_rusage *ru);
+
+asmlinkage long
+sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options)
+{
+ return compat_sys_wait4(pid, stat_addr, options, NULL);
+}
+
+int sys32_ni_syscall(int call)
+{
+ struct task_struct *me = current;
+ static char lastcomm[sizeof(me->comm)];
+
+ if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
+ printk(KERN_INFO "IA32 syscall %d from %s not implemented\n",
+ call, me->comm);
+ strncpy(lastcomm, me->comm, sizeof(lastcomm));
+ }
+ return -ENOSYS;
+}
+
+/* 32-bit timeval and related flotsam. */
+
+asmlinkage long
+sys32_sysfs(int option, u32 arg1, u32 arg2)
+{
+ return sys_sysfs(option, arg1, arg2);
+}
+
+struct sysinfo32 {
+ s32 uptime;
+ u32 loads[3];
+ u32 totalram;
+ u32 freeram;
+ u32 sharedram;
+ u32 bufferram;
+ u32 totalswap;
+ u32 freeswap;
+ unsigned short procs;
+ unsigned short pad;
+ u32 totalhigh;
+ u32 freehigh;
+ u32 mem_unit;
+ char _f[20-2*sizeof(u32)-sizeof(int)];
+};
+
+asmlinkage long
+sys32_sysinfo(struct sysinfo32 __user *info)
+{
+ struct sysinfo s;
+ int ret;
+ mm_segment_t old_fs = get_fs ();
+ int bitcount = 0;
+
+ set_fs (KERNEL_DS);
+ ret = sys_sysinfo(&s);
+ set_fs (old_fs);
+
+ /* Check to see if any memory value is too large for 32-bit and scale
+ * down if needed
+ */
+ if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+ while (s.mem_unit < PAGE_SIZE) {
+ s.mem_unit <<= 1;
+ bitcount++;
+ }
+ s.totalram >>= bitcount;
+ s.freeram >>= bitcount;
+ s.sharedram >>= bitcount;
+ s.bufferram >>= bitcount;
+ s.totalswap >>= bitcount;
+ s.freeswap >>= bitcount;
+ s.totalhigh >>= bitcount;
+ s.freehigh >>= bitcount;
+ }
+
+ if (!access_ok(VERIFY_WRITE, info, sizeof(struct sysinfo32)) ||
+ __put_user (s.uptime, &info->uptime) ||
+ __put_user (s.loads[0], &info->loads[0]) ||
+ __put_user (s.loads[1], &info->loads[1]) ||
+ __put_user (s.loads[2], &info->loads[2]) ||
+ __put_user (s.totalram, &info->totalram) ||
+ __put_user (s.freeram, &info->freeram) ||
+ __put_user (s.sharedram, &info->sharedram) ||
+ __put_user (s.bufferram, &info->bufferram) ||
+ __put_user (s.totalswap, &info->totalswap) ||
+ __put_user (s.freeswap, &info->freeswap) ||
+ __put_user (s.procs, &info->procs) ||
+ __put_user (s.totalhigh, &info->totalhigh) ||
+ __put_user (s.freehigh, &info->freehigh) ||
+ __put_user (s.mem_unit, &info->mem_unit))
+ return -EFAULT;
+ return 0;
+}
+
+asmlinkage long
+sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval)
+{
+ struct timespec t;
+ int ret;
+ mm_segment_t old_fs = get_fs ();
+
+ set_fs (KERNEL_DS);
+ ret = sys_sched_rr_get_interval(pid, &t);
+ set_fs (old_fs);
+ if (put_compat_timespec(&t, interval))
+ return -EFAULT;
+ return ret;
+}
+
+asmlinkage long
+sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
+{
+ sigset_t s;
+ compat_sigset_t s32;
+ int ret;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs (KERNEL_DS);
+ ret = sys_rt_sigpending(&s, sigsetsize);
+ set_fs (old_fs);
+ if (!ret) {
+ switch (_NSIG_WORDS) {
+ case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3];
+ case 3: s32.sig[5] = (s.sig[2] >> 32); s32.sig[4] = s.sig[2];
+ case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1];
+ case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0];
+ }
+ if (copy_to_user (set, &s32, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ }
+ return ret;
+}
+
+asmlinkage long
+sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo)
+{
+ siginfo_t info;
+ int ret;
+ mm_segment_t old_fs = get_fs();
+
+ if (copy_siginfo_from_user32(&info, uinfo))
+ return -EFAULT;
+ set_fs (KERNEL_DS);
+ ret = sys_rt_sigqueueinfo(pid, sig, &info);
+ set_fs (old_fs);
+ return ret;
+}
+
+/* These are here just in case some old ia32 binary calls it. */
+asmlinkage long
+sys32_pause(void)
+{
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ return -ERESTARTNOHAND;
+}
+
+
+#ifdef CONFIG_SYSCTL
+struct sysctl_ia32 {
+ unsigned int name;
+ int nlen;
+ unsigned int oldval;
+ unsigned int oldlenp;
+ unsigned int newval;
+ unsigned int newlen;
+ unsigned int __unused[4];
+};
+
+
+asmlinkage long
+sys32_sysctl(struct sysctl_ia32 __user *args32)
+{
+ struct sysctl_ia32 a32;
+ mm_segment_t old_fs = get_fs ();
+ void __user *oldvalp, *newvalp;
+ size_t oldlen;
+ int __user *namep;
+ long ret;
+ extern int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp,
+ void *newval, size_t newlen);
+
+
+ if (copy_from_user(&a32, args32, sizeof (a32)))
+ return -EFAULT;
+
+ /*
+ * We need to pre-validate these because we have to disable address checking
+ * before calling do_sysctl() because of OLDLEN but we can't run the risk of the
+ * user specifying bad addresses here. Well, since we're dealing with 32 bit
+ * addresses, we KNOW that access_ok() will always succeed, so this is an
+ * expensive NOP, but so what...
+ */
+ namep = compat_ptr(a32.name);
+ oldvalp = compat_ptr(a32.oldval);
+ newvalp = compat_ptr(a32.newval);
+
+ if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
+ || !access_ok(VERIFY_WRITE, namep, 0)
+ || !access_ok(VERIFY_WRITE, oldvalp, 0)
+ || !access_ok(VERIFY_WRITE, newvalp, 0))
+ return -EFAULT;
+
+ set_fs(KERNEL_DS);
+ lock_kernel();
+ ret = do_sysctl(namep, a32.nlen, oldvalp, &oldlen, newvalp, (size_t) a32.newlen);
+ unlock_kernel();
+ set_fs(old_fs);
+
+ if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp)))
+ return -EFAULT;
+
+ return ret;
+}
+#endif
+
+/* warning: next two assume little endian */
+asmlinkage long
+sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
+{
+ return sys_pread64(fd, ubuf, count,
+ ((loff_t)AA(poshi) << 32) | AA(poslo));
+}
+
+asmlinkage long
+sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi)
+{
+ return sys_pwrite64(fd, ubuf, count,
+ ((loff_t)AA(poshi) << 32) | AA(poslo));
+}
+
+
+asmlinkage long
+sys32_personality(unsigned long personality)
+{
+ int ret;
+ if (personality(current->personality) == PER_LINUX32 &&
+ personality == PER_LINUX)
+ personality = PER_LINUX32;
+ ret = sys_personality(personality);
+ if (ret == PER_LINUX32)
+ ret = PER_LINUX;
+ return ret;
+}
+
+asmlinkage long
+sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count)
+{
+ mm_segment_t old_fs = get_fs();
+ int ret;
+ off_t of;
+
+ if (offset && get_user(of, offset))
+ return -EFAULT;
+
+ set_fs(KERNEL_DS);
+ ret = sys_sendfile(out_fd, in_fd, offset ? &of : NULL, count);
+ set_fs(old_fs);
+
+ if (!ret && offset && put_user(of, offset))
+ return -EFAULT;
+
+ return ret;
+}
+
+/* Handle adjtimex compatibility. */
+
+struct timex32 {
+ u32 modes;
+ s32 offset, freq, maxerror, esterror;
+ s32 status, constant, precision, tolerance;
+ struct compat_timeval time;
+ s32 tick;
+ s32 ppsfreq, jitter, shift, stabil;
+ s32 jitcnt, calcnt, errcnt, stbcnt;
+ s32 :32; s32 :32; s32 :32; s32 :32;
+ s32 :32; s32 :32; s32 :32; s32 :32;
+ s32 :32; s32 :32; s32 :32; s32 :32;
+};
+
+extern int do_adjtimex(struct timex *);
+
+asmlinkage long
+sys32_adjtimex(struct timex32 __user *utp)
+{
+ struct timex txc;
+ int ret;
+
+ memset(&txc, 0, sizeof(struct timex));
+
+ if (!access_ok(VERIFY_READ, utp, sizeof(struct timex32)) ||
+ __get_user(txc.modes, &utp->modes) ||
+ __get_user(txc.offset, &utp->offset) ||
+ __get_user(txc.freq, &utp->freq) ||
+ __get_user(txc.maxerror, &utp->maxerror) ||
+ __get_user(txc.esterror, &utp->esterror) ||
+ __get_user(txc.status, &utp->status) ||
+ __get_user(txc.constant, &utp->constant) ||
+ __get_user(txc.precision, &utp->precision) ||
+ __get_user(txc.tolerance, &utp->tolerance) ||
+ __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+ __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+ __get_user(txc.tick, &utp->tick) ||
+ __get_user(txc.ppsfreq, &utp->ppsfreq) ||
+ __get_user(txc.jitter, &utp->jitter) ||
+ __get_user(txc.shift, &utp->shift) ||
+ __get_user(txc.stabil, &utp->stabil) ||
+ __get_user(txc.jitcnt, &utp->jitcnt) ||
+ __get_user(txc.calcnt, &utp->calcnt) ||
+ __get_user(txc.errcnt, &utp->errcnt) ||
+ __get_user(txc.stbcnt, &utp->stbcnt))
+ return -EFAULT;
+
+ ret = do_adjtimex(&txc);
+
+ if (!access_ok(VERIFY_WRITE, utp, sizeof(struct timex32)) ||
+ __put_user(txc.modes, &utp->modes) ||
+ __put_user(txc.offset, &utp->offset) ||
+ __put_user(txc.freq, &utp->freq) ||
+ __put_user(txc.maxerror, &utp->maxerror) ||
+ __put_user(txc.esterror, &utp->esterror) ||
+ __put_user(txc.status, &utp->status) ||
+ __put_user(txc.constant, &utp->constant) ||
+ __put_user(txc.precision, &utp->precision) ||
+ __put_user(txc.tolerance, &utp->tolerance) ||
+ __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
+ __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
+ __put_user(txc.tick, &utp->tick) ||
+ __put_user(txc.ppsfreq, &utp->ppsfreq) ||
+ __put_user(txc.jitter, &utp->jitter) ||
+ __put_user(txc.shift, &utp->shift) ||
+ __put_user(txc.stabil, &utp->stabil) ||
+ __put_user(txc.jitcnt, &utp->jitcnt) ||
+ __put_user(txc.calcnt, &utp->calcnt) ||
+ __put_user(txc.errcnt, &utp->errcnt) ||
+ __put_user(txc.stbcnt, &utp->stbcnt))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
+ unsigned long prot, unsigned long flags,
+ unsigned long fd, unsigned long pgoff)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long error;
+ struct file * file = NULL;
+
+ flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+ if (!(flags & MAP_ANONYMOUS)) {
+ file = fget(fd);
+ if (!file)
+ return -EBADF;
+ }
+
+ down_write(&mm->mmap_sem);
+ error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+ up_write(&mm->mmap_sem);
+
+ if (file)
+ fput(file);
+ return error;
+}
+
+asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
+{
+ int error;
+
+ if (!name)
+ return -EFAULT;
+ if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname)))
+ return -EFAULT;
+
+ down_read(&uts_sem);
+
+ error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
+ __put_user(0,name->sysname+__OLD_UTS_LEN);
+ __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
+ __put_user(0,name->nodename+__OLD_UTS_LEN);
+ __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
+ __put_user(0,name->release+__OLD_UTS_LEN);
+ __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
+ __put_user(0,name->version+__OLD_UTS_LEN);
+ {
+ char *arch = "x86_64";
+ if (personality(current->personality) == PER_LINUX32)
+ arch = "i686";
+
+ __copy_to_user(&name->machine,arch,strlen(arch)+1);
+ }
+
+ up_read(&uts_sem);
+
+ error = error ? -EFAULT : 0;
+
+ return error;
+}
+
+long sys32_uname(struct old_utsname __user * name)
+{
+ int err;
+ if (!name)
+ return -EFAULT;
+ down_read(&uts_sem);
+ err=copy_to_user(name, &system_utsname, sizeof (*name));
+ up_read(&uts_sem);
+ if (personality(current->personality) == PER_LINUX32)
+ err |= copy_to_user(&name->machine, "i686", 5);
+ return err?-EFAULT:0;
+}
+
+long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
+{
+ struct ustat u;
+ mm_segment_t seg;
+ int ret;
+
+ seg = get_fs();
+ set_fs(KERNEL_DS);
+ ret = sys_ustat(dev,&u);
+ set_fs(seg);
+ if (ret >= 0) {
+ if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) ||
+ __put_user((__u32) u.f_tfree, &u32p->f_tfree) ||
+ __put_user((__u32) u.f_tinode, &u32p->f_tfree) ||
+ __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) ||
+ __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack)))
+ ret = -EFAULT;
+ }
+ return ret;
+}
+
+asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
+ compat_uptr_t __user *envp, struct pt_regs *regs)
+{
+ long error;
+ char * filename;
+
+ filename = getname(name);
+ error = PTR_ERR(filename);
+ if (IS_ERR(filename))
+ return error;
+ error = compat_do_execve(filename, argv, envp, regs);
+ if (error == 0) {
+ task_lock(current);
+ current->ptrace &= ~PT_DTRACE;
+ task_unlock(current);
+ }
+ putname(filename);
+ return error;
+}
+
+asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp,
+ struct pt_regs *regs)
+{
+ void __user *parent_tid = (void __user *)regs->rdx;
+ void __user *child_tid = (void __user *)regs->rdi;
+ if (!newsp)
+ newsp = regs->rsp;
+ return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
+}
+
+/*
+ * Some system calls that need sign extended arguments. This could be done by a generic wrapper.
+ */
+
+long sys32_lseek (unsigned int fd, int offset, unsigned int whence)
+{
+ return sys_lseek(fd, offset, whence);
+}
+
+long sys32_kill(int pid, int sig)
+{
+ return sys_kill(pid, sig);
+}
+
+asmlinkage long sys32_open(const char __user * filename, int flags, int mode)
+{
+ char * tmp;
+ int fd, error;
+
+ /* don't force O_LARGEFILE */
+ tmp = getname(filename);
+ fd = PTR_ERR(tmp);
+ if (!IS_ERR(tmp)) {
+ fd = get_unused_fd();
+ if (fd >= 0) {
+ struct file *f = filp_open(tmp, flags, mode);
+ error = PTR_ERR(f);
+ if (IS_ERR(f)) {
+ put_unused_fd(fd);
+ fd = error;
+ } else
+ fd_install(fd, f);
+ }
+ putname(tmp);
+ }
+ return fd;
+}
+
+extern asmlinkage long
+sys_timer_create(clockid_t which_clock,
+ struct sigevent __user *timer_event_spec,
+ timer_t __user * created_timer_id);
+
+long
+sys32_timer_create(u32 clock, struct compat_sigevent __user *se32, timer_t __user *timer_id)
+{
+ struct sigevent __user *p = NULL;
+ if (se32) {
+ struct sigevent se;
+ p = compat_alloc_user_space(sizeof(struct sigevent));
+ if (get_compat_sigevent(&se, se32) ||
+ copy_to_user(p, &se, sizeof(se)))
+ return -EFAULT;
+ }
+ return sys_timer_create(clock, p, timer_id);
+}
+
+long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
+ __u32 len_low, __u32 len_high, int advice)
+{
+ return sys_fadvise64_64(fd,
+ (((u64)offset_high)<<32) | offset_low,
+ (((u64)len_high)<<32) | len_low,
+ advice);
+}
+
+long sys32_vm86_warning(void)
+{
+ struct task_struct *me = current;
+ static char lastcomm[sizeof(me->comm)];
+ if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
+ printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
+ me->comm);
+ strncpy(lastcomm, me->comm, sizeof(lastcomm));
+ }
+ return -ENOSYS;
+}
+
+long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
+ char __user * buf, size_t len)
+{
+ return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len);
+}
+
+static int __init ia32_init (void)
+{
+ printk("IA32 emulation $Id: sys_ia32.c,v 1.32 2002/03/24 13:02:28 ak Exp $\n");
+ return 0;
+}
+
+__initcall(ia32_init);
+
+extern unsigned long ia32_sys_call_table[];
+EXPORT_SYMBOL(ia32_sys_call_table);
diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c
new file mode 100644
index 000000000000..399ff4985099
--- /dev/null
+++ b/arch/x86_64/ia32/syscall32.c
@@ -0,0 +1,111 @@
+/* Copyright 2002,2003 Andi Kleen, SuSE Labs */
+
+/* vsyscall handling for 32bit processes. Map a stub page into it
+ on demand because 32bit cannot reach the kernel's fixmaps */
+
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/stringify.h>
+#include <asm/proto.h>
+#include <asm/tlbflush.h>
+#include <asm/ia32_unistd.h>
+
+/* 32bit VDSOs mapped into user space. */
+asm(".section \".init.data\",\"aw\"\n"
+ "syscall32_syscall:\n"
+ ".incbin \"arch/x86_64/ia32/vsyscall-syscall.so\"\n"
+ "syscall32_syscall_end:\n"
+ "syscall32_sysenter:\n"
+ ".incbin \"arch/x86_64/ia32/vsyscall-sysenter.so\"\n"
+ "syscall32_sysenter_end:\n"
+ ".previous");
+
+extern unsigned char syscall32_syscall[], syscall32_syscall_end[];
+extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[];
+extern int sysctl_vsyscall32;
+
+char *syscall32_page;
+static int use_sysenter = -1;
+
+/*
+ * Map the 32bit vsyscall page on demand.
+ *
+ * RED-PEN: This knows too much about high level VM.
+ *
+ * Alternative would be to generate a vma with appropriate backing options
+ * and let it be handled by generic VM.
+ */
+int __map_syscall32(struct mm_struct *mm, unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pte_t *pte;
+ pmd_t *pmd;
+ int err = -ENOMEM;
+
+ spin_lock(&mm->page_table_lock);
+ pgd = pgd_offset(mm, address);
+ pud = pud_alloc(mm, pgd, address);
+ if (pud) {
+ pmd = pmd_alloc(mm, pud, address);
+ if (pmd && (pte = pte_alloc_map(mm, pmd, address)) != NULL) {
+ if (pte_none(*pte)) {
+ set_pte(pte,
+ mk_pte(virt_to_page(syscall32_page),
+ PAGE_KERNEL_VSYSCALL32));
+ }
+ /* Flush only the local CPU. Other CPUs taking a fault
+ will just end up here again
+ This probably not needed and just paranoia. */
+ __flush_tlb_one(address);
+ err = 0;
+ }
+ }
+ spin_unlock(&mm->page_table_lock);
+ return err;
+}
+
+int map_syscall32(struct mm_struct *mm, unsigned long address)
+{
+ int err;
+ down_read(&mm->mmap_sem);
+ err = __map_syscall32(mm, address);
+ up_read(&mm->mmap_sem);
+ return err;
+}
+
+static int __init init_syscall32(void)
+{
+ syscall32_page = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!syscall32_page)
+ panic("Cannot allocate syscall32 page");
+ SetPageReserved(virt_to_page(syscall32_page));
+ if (use_sysenter > 0) {
+ memcpy(syscall32_page, syscall32_sysenter,
+ syscall32_sysenter_end - syscall32_sysenter);
+ } else {
+ memcpy(syscall32_page, syscall32_syscall,
+ syscall32_syscall_end - syscall32_syscall);
+ }
+ return 0;
+}
+
+__initcall(init_syscall32);
+
+/* May not be __init: called during resume */
+void syscall32_cpu_init(void)
+{
+ if (use_sysenter < 0)
+ use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
+
+ /* Load these always in case some future AMD CPU supports
+ SYSENTER from compat mode too. */
+ checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
+ checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+
+ wrmsrl(MSR_CSTAR, ia32_cstar_target);
+}
diff --git a/arch/x86_64/ia32/tls32.c b/arch/x86_64/ia32/tls32.c
new file mode 100644
index 000000000000..1cc4340de3ca
--- /dev/null
+++ b/arch/x86_64/ia32/tls32.c
@@ -0,0 +1,163 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/user.h>
+
+#include <asm/uaccess.h>
+#include <asm/desc.h>
+#include <asm/system.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/proto.h>
+
+/*
+ * sys_alloc_thread_area: get a yet unused TLS descriptor index.
+ */
+static int get_free_idx(void)
+{
+ struct thread_struct *t = &current->thread;
+ int idx;
+
+ for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++)
+ if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx))
+ return idx + GDT_ENTRY_TLS_MIN;
+ return -ESRCH;
+}
+
+/*
+ * Set a given TLS descriptor:
+ * When you want addresses > 32bit use arch_prctl()
+ */
+int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
+{
+ struct user_desc info;
+ struct n_desc_struct *desc;
+ int cpu, idx;
+
+ if (copy_from_user(&info, u_info, sizeof(info)))
+ return -EFAULT;
+
+ idx = info.entry_number;
+
+ /*
+ * index -1 means the kernel should try to find and
+ * allocate an empty descriptor:
+ */
+ if (idx == -1) {
+ idx = get_free_idx();
+ if (idx < 0)
+ return idx;
+ if (put_user(idx, &u_info->entry_number))
+ return -EFAULT;
+ }
+
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return -EINVAL;
+
+ desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
+
+ /*
+ * We must not get preempted while modifying the TLS.
+ */
+ cpu = get_cpu();
+
+ if (LDT_empty(&info)) {
+ desc->a = 0;
+ desc->b = 0;
+ } else {
+ desc->a = LDT_entry_a(&info);
+ desc->b = LDT_entry_b(&info);
+ }
+ if (t == &current->thread)
+ load_TLS(t, cpu);
+
+ put_cpu();
+ return 0;
+}
+
+asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info)
+{
+ return do_set_thread_area(&current->thread, u_info);
+}
+
+
+/*
+ * Get the current Thread-Local Storage area:
+ */
+
+#define GET_BASE(desc) ( \
+ (((desc)->a >> 16) & 0x0000ffff) | \
+ (((desc)->b << 16) & 0x00ff0000) | \
+ ( (desc)->b & 0xff000000) )
+
+#define GET_LIMIT(desc) ( \
+ ((desc)->a & 0x0ffff) | \
+ ((desc)->b & 0xf0000) )
+
+#define GET_32BIT(desc) (((desc)->b >> 22) & 1)
+#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3)
+#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1)
+#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1)
+#define GET_PRESENT(desc) (((desc)->b >> 15) & 1)
+#define GET_USEABLE(desc) (((desc)->b >> 20) & 1)
+#define GET_LONGMODE(desc) (((desc)->b >> 21) & 1)
+
+int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info)
+{
+ struct user_desc info;
+ struct n_desc_struct *desc;
+ int idx;
+
+ if (get_user(idx, &u_info->entry_number))
+ return -EFAULT;
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return -EINVAL;
+
+ desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN;
+
+ memset(&info, 0, sizeof(struct user_desc));
+ info.entry_number = idx;
+ info.base_addr = GET_BASE(desc);
+ info.limit = GET_LIMIT(desc);
+ info.seg_32bit = GET_32BIT(desc);
+ info.contents = GET_CONTENTS(desc);
+ info.read_exec_only = !GET_WRITABLE(desc);
+ info.limit_in_pages = GET_LIMIT_PAGES(desc);
+ info.seg_not_present = !GET_PRESENT(desc);
+ info.useable = GET_USEABLE(desc);
+ info.lm = GET_LONGMODE(desc);
+
+ if (copy_to_user(u_info, &info, sizeof(info)))
+ return -EFAULT;
+ return 0;
+}
+
+asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info)
+{
+ return do_get_thread_area(&current->thread, u_info);
+}
+
+
+int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs)
+{
+ struct n_desc_struct *desc;
+ struct user_desc info;
+ struct user_desc __user *cp;
+ int idx;
+
+ cp = (void __user *)childregs->rsi;
+ if (copy_from_user(&info, cp, sizeof(info)))
+ return -EFAULT;
+ if (LDT_empty(&info))
+ return -EINVAL;
+
+ idx = info.entry_number;
+ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+ return -EINVAL;
+
+ desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN;
+ desc->a = LDT_entry_a(&info);
+ desc->b = LDT_entry_b(&info);
+
+ return 0;
+}
diff --git a/arch/x86_64/ia32/vsyscall-sigreturn.S b/arch/x86_64/ia32/vsyscall-sigreturn.S
new file mode 100644
index 000000000000..ba4067d350e4
--- /dev/null
+++ b/arch/x86_64/ia32/vsyscall-sigreturn.S
@@ -0,0 +1,120 @@
+/*
+ * Common code for the sigreturn entry points on the vsyscall page.
+ * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80)
+ * to enter the kernel.
+ * This file is #include'd by vsyscall-*.S to define them after the
+ * vsyscall entry point. The addresses we get for these entry points
+ * by doing ".balign 32" must match in both versions of the page.
+ */
+
+ .section .text.sigreturn,"ax"
+ .balign 32
+ .globl __kernel_sigreturn
+ .type __kernel_sigreturn,@function
+__kernel_sigreturn:
+.LSTART_sigreturn:
+ popl %eax
+ movl $__NR_ia32_sigreturn, %eax
+ SYSCALL_ENTER_KERNEL
+.LEND_sigreturn:
+ .size __kernel_sigreturn,.-.LSTART_sigreturn
+
+ .section .text.rtsigreturn,"ax"
+ .balign 32
+ .globl __kernel_rt_sigreturn
+ .type __kernel_rt_sigreturn,@function
+__kernel_rt_sigreturn:
+.LSTART_rt_sigreturn:
+ movl $__NR_ia32_rt_sigreturn, %eax
+ SYSCALL_ENTER_KERNEL
+.LEND_rt_sigreturn:
+ .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
+
+ .section .eh_frame,"a",@progbits
+ .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */
+.LSTARTFDE2:
+ .long .LSTARTFDE2-.LSTARTFRAME /* CIE pointer */
+ /* HACK: The dwarf2 unwind routines will subtract 1 from the
+ return address to get an address in the middle of the
+ presumed call instruction. Since we didn't get here via
+ a call, we need to include the nop before the real start
+ to make up for it. */
+ .long .LSTART_sigreturn-1-. /* PC-relative start address */
+ .long .LEND_sigreturn-.LSTART_sigreturn+1
+ .uleb128 0 /* Augmentation length */
+ /* What follows are the instructions for the table generation.
+ We record the locations of each register saved. This is
+ complicated by the fact that the "CFA" is always assumed to
+ be the value of the stack pointer in the caller. This means
+ that we must define the CFA of this body of code to be the
+ saved value of the stack pointer in the sigcontext. Which
+ also means that there is no fixed relation to the other
+ saved registers, which means that we must use DW_CFA_expression
+ to compute their addresses. It also means that when we
+ adjust the stack with the popl, we have to do it all over again. */
+
+#define do_cfa_expr(offset) \
+ .byte 0x0f; /* DW_CFA_def_cfa_expression */ \
+ .uleb128 1f-0f; /* length */ \
+0: .byte 0x74; /* DW_OP_breg4 */ \
+ .sleb128 offset; /* offset */ \
+ .byte 0x06; /* DW_OP_deref */ \
+1:
+
+#define do_expr(regno, offset) \
+ .byte 0x10; /* DW_CFA_expression */ \
+ .uleb128 regno; /* regno */ \
+ .uleb128 1f-0f; /* length */ \
+0: .byte 0x74; /* DW_OP_breg4 */ \
+ .sleb128 offset; /* offset */ \
+1:
+
+ do_cfa_expr(IA32_SIGCONTEXT_esp+4)
+ do_expr(0, IA32_SIGCONTEXT_eax+4)
+ do_expr(1, IA32_SIGCONTEXT_ecx+4)
+ do_expr(2, IA32_SIGCONTEXT_edx+4)
+ do_expr(3, IA32_SIGCONTEXT_ebx+4)
+ do_expr(5, IA32_SIGCONTEXT_ebp+4)
+ do_expr(6, IA32_SIGCONTEXT_esi+4)
+ do_expr(7, IA32_SIGCONTEXT_edi+4)
+ do_expr(8, IA32_SIGCONTEXT_eip+4)
+
+ .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */
+
+ do_cfa_expr(IA32_SIGCONTEXT_esp)
+ do_expr(0, IA32_SIGCONTEXT_eax)
+ do_expr(1, IA32_SIGCONTEXT_ecx)
+ do_expr(2, IA32_SIGCONTEXT_edx)
+ do_expr(3, IA32_SIGCONTEXT_ebx)
+ do_expr(5, IA32_SIGCONTEXT_ebp)
+ do_expr(6, IA32_SIGCONTEXT_esi)
+ do_expr(7, IA32_SIGCONTEXT_edi)
+ do_expr(8, IA32_SIGCONTEXT_eip)
+
+ .align 4
+.LENDFDE2:
+
+ .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */
+.LSTARTFDE3:
+ .long .LSTARTFDE3-.LSTARTFRAME /* CIE pointer */
+ /* HACK: See above wrt unwind library assumptions. */
+ .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
+ .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
+ .uleb128 0 /* Augmentation */
+ /* What follows are the instructions for the table generation.
+ We record the locations of each register saved. This is
+ slightly less complicated than the above, since we don't
+ modify the stack pointer in the process. */
+
+ do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp)
+ do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax)
+ do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx)
+ do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx)
+ do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx)
+ do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp)
+ do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi)
+ do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi)
+ do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip)
+
+ .align 4
+.LENDFDE3:
diff --git a/arch/x86_64/ia32/vsyscall-syscall.S b/arch/x86_64/ia32/vsyscall-syscall.S
new file mode 100644
index 000000000000..e2aaf3de8a42
--- /dev/null
+++ b/arch/x86_64/ia32/vsyscall-syscall.S
@@ -0,0 +1,68 @@
+/*
+ * Code for the vsyscall page. This version uses the syscall instruction.
+ */
+
+#include <asm/ia32_unistd.h>
+#include <asm/offset.h>
+#include <asm/segment.h>
+
+ .text
+ .section .text.vsyscall,"ax"
+ .globl __kernel_vsyscall
+ .type __kernel_vsyscall,@function
+__kernel_vsyscall:
+.LSTART_vsyscall:
+ push %ebp
+.Lpush_ebp:
+ movl %ecx, %ebp
+ syscall
+ movl $__USER32_DS, %ecx
+ movl %ecx, %ss
+ movl %ebp, %ecx
+ popl %ebp
+.Lpop_ebp:
+ ret
+.LEND_vsyscall:
+ .size __kernel_vsyscall,.-.LSTART_vsyscall
+
+ .section .eh_frame,"a",@progbits
+.LSTARTFRAME:
+ .long .LENDCIE-.LSTARTCIE
+.LSTARTCIE:
+ .long 0 /* CIE ID */
+ .byte 1 /* Version number */
+ .string "zR" /* NUL-terminated augmentation string */
+ .uleb128 1 /* Code alignment factor */
+ .sleb128 -4 /* Data alignment factor */
+ .byte 8 /* Return address register column */
+ .uleb128 1 /* Augmentation value length */
+ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+ .byte 0x0c /* DW_CFA_def_cfa */
+ .uleb128 4
+ .uleb128 4
+ .byte 0x88 /* DW_CFA_offset, column 0x8 */
+ .uleb128 1
+ .align 4
+.LENDCIE:
+
+ .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
+.LSTARTFDE1:
+ .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
+ .long .LSTART_vsyscall-. /* PC-relative start address */
+ .long .LEND_vsyscall-.LSTART_vsyscall
+ .uleb128 0 /* Augmentation length */
+ /* What follows are the instructions for the table generation.
+ We have to record all changes of the stack pointer. */
+ .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .uleb128 8
+ .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */
+ .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */
+ .byte 0xc5 /* DW_CFA_restore %ebp */
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .uleb128 4
+ .align 4
+.LENDFDE1:
+
+#define SYSCALL_ENTER_KERNEL syscall
+#include "vsyscall-sigreturn.S"
diff --git a/arch/x86_64/ia32/vsyscall-sysenter.S b/arch/x86_64/ia32/vsyscall-sysenter.S
new file mode 100644
index 000000000000..8fb8e0ff3afa
--- /dev/null
+++ b/arch/x86_64/ia32/vsyscall-sysenter.S
@@ -0,0 +1,94 @@
+/*
+ * Code for the vsyscall page. This version uses the sysenter instruction.
+ */
+
+#include <asm/ia32_unistd.h>
+#include <asm/offset.h>
+
+ .text
+ .section .text.vsyscall,"ax"
+ .globl __kernel_vsyscall
+ .type __kernel_vsyscall,@function
+__kernel_vsyscall:
+.LSTART_vsyscall:
+ push %ecx
+.Lpush_ecx:
+ push %edx
+.Lpush_edx:
+ push %ebp
+.Lenter_kernel:
+ movl %esp,%ebp
+ sysenter
+ .space 7,0x90
+ jmp .Lenter_kernel
+ /* 16: System call normal return point is here! */
+ pop %ebp
+.Lpop_ebp:
+ pop %edx
+.Lpop_edx:
+ pop %ecx
+.Lpop_ecx:
+ ret
+.LEND_vsyscall:
+ .size __kernel_vsyscall,.-.LSTART_vsyscall
+
+ .section .eh_frame,"a",@progbits
+.LSTARTFRAME:
+ .long .LENDCIE-.LSTARTCIE
+.LSTARTCIE:
+ .long 0 /* CIE ID */
+ .byte 1 /* Version number */
+ .string "zR" /* NUL-terminated augmentation string */
+ .uleb128 1 /* Code alignment factor */
+ .sleb128 -4 /* Data alignment factor */
+ .byte 8 /* Return address register column */
+ .uleb128 1 /* Augmentation value length */
+ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+ .byte 0x0c /* DW_CFA_def_cfa */
+ .uleb128 4
+ .uleb128 4
+ .byte 0x88 /* DW_CFA_offset, column 0x8 */
+ .uleb128 1
+ .align 4
+.LENDCIE:
+
+ .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
+.LSTARTFDE1:
+ .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
+ .long .LSTART_vsyscall-. /* PC-relative start address */
+ .long .LEND_vsyscall-.LSTART_vsyscall
+ .uleb128 0 /* Augmentation length */
+ /* What follows are the instructions for the table generation.
+ We have to record all changes of the stack pointer. */
+ .byte 0x04 /* DW_CFA_advance_loc4 */
+ .long .Lpush_ecx-.LSTART_vsyscall
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .byte 0x08 /* RA at offset 8 now */
+ .byte 0x04 /* DW_CFA_advance_loc4 */
+ .long .Lpush_edx-.Lpush_ecx
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .byte 0x0c /* RA at offset 12 now */
+ .byte 0x04 /* DW_CFA_advance_loc4 */
+ .long .Lenter_kernel-.Lpush_edx
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .byte 0x10 /* RA at offset 16 now */
+ .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
+ /* Finally the epilogue. */
+ .byte 0x04 /* DW_CFA_advance_loc4 */
+ .long .Lpop_ebp-.Lenter_kernel
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .byte 0x12 /* RA at offset 12 now */
+ .byte 0xc5 /* DW_CFA_restore %ebp */
+ .byte 0x04 /* DW_CFA_advance_loc4 */
+ .long .Lpop_edx-.Lpop_ebp
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .byte 0x08 /* RA at offset 8 now */
+ .byte 0x04 /* DW_CFA_advance_loc4 */
+ .long .Lpop_ecx-.Lpop_edx
+ .byte 0x0e /* DW_CFA_def_cfa_offset */
+ .byte 0x04 /* RA at offset 4 now */
+ .align 4
+.LENDFDE1:
+
+#define SYSCALL_ENTER_KERNEL int $0x80
+#include "vsyscall-sigreturn.S"
diff --git a/arch/x86_64/ia32/vsyscall.lds b/arch/x86_64/ia32/vsyscall.lds
new file mode 100644
index 000000000000..fa4b4dd4a9ff
--- /dev/null
+++ b/arch/x86_64/ia32/vsyscall.lds
@@ -0,0 +1,77 @@
+/*
+ * Linker script for vsyscall DSO. The vsyscall page is an ELF shared
+ * object prelinked to its virtual address. This script controls its layout.
+ */
+
+/* This must match <asm/fixmap.h>. */
+VSYSCALL_BASE = 0xffffe000;
+
+SECTIONS
+{
+ . = VSYSCALL_BASE + SIZEOF_HEADERS;
+
+ .hash : { *(.hash) } :text
+ .dynsym : { *(.dynsym) }
+ .dynstr : { *(.dynstr) }
+ .gnu.version : { *(.gnu.version) }
+ .gnu.version_d : { *(.gnu.version_d) }
+ .gnu.version_r : { *(.gnu.version_r) }
+
+ /* This linker script is used both with -r and with -shared.
+ For the layouts to match, we need to skip more than enough
+ space for the dynamic symbol table et al. If this amount
+ is insufficient, ld -shared will barf. Just increase it here. */
+ . = VSYSCALL_BASE + 0x400;
+
+ .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090
+
+ /* This is an 32bit object and we cannot easily get the offsets
+ into the 64bit kernel. Just hardcode them here. This assumes
+ that all the stubs don't need more than 0x100 bytes. */
+ . = VSYSCALL_BASE + 0x500;
+
+ .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090
+
+ . = VSYSCALL_BASE + 0x600;
+
+ .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090
+
+ .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr
+ .eh_frame : { KEEP (*(.eh_frame)) } :text
+ .dynamic : { *(.dynamic) } :text :dynamic
+ .useless : {
+ *(.got.plt) *(.got)
+ *(.data .data.* .gnu.linkonce.d.*)
+ *(.dynbss)
+ *(.bss .bss.* .gnu.linkonce.b.*)
+ } :text
+}
+
+/*
+ * We must supply the ELF program headers explicitly to get just one
+ * PT_LOAD segment, and set the flags explicitly to make segments read-only.
+ */
+PHDRS
+{
+ text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */
+ dynamic PT_DYNAMIC FLAGS(4); /* PF_R */
+ eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */
+}
+
+/*
+ * This controls what symbols we export from the DSO.
+ */
+VERSION
+{
+ LINUX_2.5 {
+ global:
+ __kernel_vsyscall;
+ __kernel_sigreturn;
+ __kernel_rt_sigreturn;
+
+ local: *;
+ };
+}
+
+/* The ELF entry point can be used to set the AT_SYSINFO value. */
+ENTRY(__kernel_vsyscall);
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
new file mode 100644
index 000000000000..0a3318e08ab6
--- /dev/null
+++ b/arch/x86_64/kernel/Makefile
@@ -0,0 +1,45 @@
+#
+# Makefile for the linux kernel.
+#
+
+extra-y := head.o head64.o init_task.o vmlinux.lds
+EXTRA_AFLAGS := -traditional
+obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
+ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
+ x8664_ksyms.o i387.o syscall.o vsyscall.o \
+ setup64.o bootflag.o e820.o reboot.o quirks.o
+
+obj-$(CONFIG_X86_MCE) += mce.o
+obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
+obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
+obj-$(CONFIG_ACPI_BOOT) += acpi/
+obj-$(CONFIG_X86_MSR) += msr.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+obj-$(CONFIG_X86_CPUID) += cpuid.o
+obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
+ genapic.o genapic_cluster.o genapic_flat.o
+obj-$(CONFIG_PM) += suspend.o
+obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
+obj-$(CONFIG_CPU_FREQ) += cpufreq/
+obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
+obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
+obj-$(CONFIG_SWIOTLB) += swiotlb.o
+obj-$(CONFIG_KPROBES) += kprobes.o
+
+obj-$(CONFIG_MODULES) += module.o
+
+obj-y += topology.o
+obj-y += intel_cacheinfo.o
+
+CFLAGS_vsyscall.o := $(PROFILING) -g0
+
+bootflag-y += ../../i386/kernel/bootflag.o
+cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
+topology-y += ../../i386/mach-default/topology.o
+swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o
+microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o
+intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
+quirks-y += ../../i386/kernel/quirks.o
diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile
new file mode 100644
index 000000000000..d2c2ee5f9a88
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_ACPI_BOOT) := boot.o
+boot-$(CONFIG_ACPI_BOOT) := ../../../i386/kernel/acpi/boot.o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
new file mode 100644
index 000000000000..7a275de6df22
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -0,0 +1,132 @@
+/*
+ * acpi.c - Architecture-Specific Low-Level ACPI Support
+ *
+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
+ * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
+ * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port)
+ * Copyright (C) 2003 Pavel Machek, SuSE Labs
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/bootmem.h>
+#include <linux/irq.h>
+#include <linux/acpi.h>
+#include <asm/mpspec.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/apicdef.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/io_apic.h>
+#include <asm/proto.h>
+#include <asm/tlbflush.h>
+
+
+/* --------------------------------------------------------------------------
+ Low-Level Sleep Support
+ -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI_SLEEP
+
+/* address in low memory of the wakeup routine. */
+unsigned long acpi_wakeup_address = 0;
+unsigned long acpi_video_flags;
+extern char wakeup_start, wakeup_end;
+
+extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long));
+
+static pgd_t low_ptr;
+
+static void init_low_mapping(void)
+{
+ pgd_t *slot0 = pgd_offset(current->mm, 0UL);
+ low_ptr = *slot0;
+ set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET));
+ flush_tlb_all();
+}
+
+/**
+ * acpi_save_state_mem - save kernel state
+ *
+ * Create an identity mapped page table and copy the wakeup routine to
+ * low memory.
+ */
+int acpi_save_state_mem (void)
+{
+ init_low_mapping();
+
+ memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start);
+ acpi_copy_wakeup_routine(acpi_wakeup_address);
+
+ return 0;
+}
+
+/*
+ * acpi_restore_state
+ */
+void acpi_restore_state_mem (void)
+{
+ set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
+ flush_tlb_all();
+}
+
+/**
+ * acpi_reserve_bootmem - do _very_ early ACPI initialisation
+ *
+ * We allocate a page in low memory for the wakeup
+ * routine for when we come back from a sleep state. The
+ * runtime allocator allows specification of <16M pages, but not
+ * <1M pages.
+ */
+void __init acpi_reserve_bootmem(void)
+{
+ acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
+ if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
+ printk(KERN_CRIT "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
+}
+
+static int __init acpi_sleep_setup(char *str)
+{
+ while ((str != NULL) && (*str != '\0')) {
+ if (strncmp(str, "s3_bios", 7) == 0)
+ acpi_video_flags = 1;
+ if (strncmp(str, "s3_mode", 7) == 0)
+ acpi_video_flags |= 2;
+ str = strchr(str, ',');
+ if (str != NULL)
+ str += strspn(str, ", \t");
+ }
+ return 1;
+}
+
+__setup("acpi_sleep=", acpi_sleep_setup);
+
+#endif /*CONFIG_ACPI_SLEEP*/
+
+void acpi_pci_link_exit(void) {}
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
new file mode 100644
index 000000000000..a4c630034cd4
--- /dev/null
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -0,0 +1,527 @@
+.text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+
+# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
+#
+# wakeup_code runs in real mode, and at unknown address (determined at run-time).
+# Therefore it must only use relative jumps/calls.
+#
+# Do we need to deal with A20? It is okay: ACPI specs says A20 must be enabled
+#
+# If physical address of wakeup_code is 0x12345, BIOS should call us with
+# cs = 0x1234, eip = 0x05
+#
+
+
+ALIGN
+ .align 16
+ENTRY(wakeup_start)
+wakeup_code:
+ wakeup_code_start = .
+ .code16
+
+# Running in *copy* of this code, somewhere in low 1MB.
+
+ movb $0xa1, %al ; outb %al, $0x80
+ cli
+ cld
+ # setup data segment
+ movw %cs, %ax
+ movw %ax, %ds # Make ds:0 point to wakeup_start
+ movw %ax, %ss
+ mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board
+
+ pushl $0 # Kill any dangerous flags
+ popfl
+
+ movl real_magic - wakeup_code, %eax
+ cmpl $0x12345678, %eax
+ jne bogus_real_magic
+
+ testl $1, video_flags - wakeup_code
+ jz 1f
+ lcall $0xc000,$3
+ movw %cs, %ax
+ movw %ax, %ds # Bios might have played with that
+ movw %ax, %ss
+1:
+
+ testl $2, video_flags - wakeup_code
+ jz 1f
+ mov video_mode - wakeup_code, %ax
+ call mode_seta
+1:
+
+ movw $0xb800, %ax
+ movw %ax,%fs
+ movw $0x0e00 + 'L', %fs:(0x10)
+
+ movb $0xa2, %al ; outb %al, $0x80
+
+ lidt %ds:idt_48a - wakeup_code
+ xorl %eax, %eax
+ movw %ds, %ax # (Convert %ds:gdt to a linear ptr)
+ shll $4, %eax
+ addl $(gdta - wakeup_code), %eax
+ movl %eax, gdt_48a +2 - wakeup_code
+ lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is
+ # appropriate
+
+ movl $1, %eax # protected mode (PE) bit
+ lmsw %ax # This is it!
+ jmp 1f
+1:
+
+ .byte 0x66, 0xea # prefix + jmpi-opcode
+ .long wakeup_32 - __START_KERNEL_map
+ .word __KERNEL_CS
+
+ .code32
+wakeup_32:
+# Running in this code, but at low address; paging is not yet turned on.
+ movb $0xa5, %al ; outb %al, $0x80
+
+ /* Check if extended functions are implemented */
+ movl $0x80000000, %eax
+ cpuid
+ cmpl $0x80000000, %eax
+ jbe bogus_cpu
+ wbinvd
+ mov $0x80000001, %eax
+ cpuid
+ btl $29, %edx
+ jnc bogus_cpu
+ movl %edx,%edi
+
+ movw $__KERNEL_DS, %ax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+
+ movw $__KERNEL_DS, %ax
+ movw %ax, %ss
+
+ mov $(wakeup_stack - __START_KERNEL_map), %esp
+ movl saved_magic - __START_KERNEL_map, %eax
+ cmpl $0x9abcdef0, %eax
+ jne bogus_32_magic
+
+ /*
+ * Prepare for entering 64bits mode
+ */
+
+ /* Enable PAE mode and PGE */
+ xorl %eax, %eax
+ btsl $5, %eax
+ btsl $7, %eax
+ movl %eax, %cr4
+
+ /* Setup early boot stage 4 level pagetables */
+ movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax
+ movl %eax, %cr3
+
+ /* Setup EFER (Extended Feature Enable Register) */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ /* Fool rdmsr and reset %eax to avoid dependences */
+ xorl %eax, %eax
+ /* Enable Long Mode */
+ btsl $_EFER_LME, %eax
+ /* Enable System Call */
+ btsl $_EFER_SCE, %eax
+
+ /* No Execute supported? */
+ btl $20,%edi
+ jnc 1f
+ btsl $_EFER_NX, %eax
+1:
+
+ /* Make changes effective */
+ wrmsr
+ wbinvd
+
+ xorl %eax, %eax
+ btsl $31, %eax /* Enable paging and in turn activate Long Mode */
+ btsl $0, %eax /* Enable protected mode */
+ btsl $1, %eax /* Enable MP */
+ btsl $4, %eax /* Enable ET */
+ btsl $5, %eax /* Enable NE */
+ btsl $16, %eax /* Enable WP */
+ btsl $18, %eax /* Enable AM */
+
+ /* Make changes effective */
+ movl %eax, %cr0
+ /* At this point:
+ CR4.PAE must be 1
+ CS.L must be 0
+ CR3 must point to PML4
+ Next instruction must be a branch
+ This must be on identity-mapped page
+ */
+ jmp reach_compatibility_mode
+reach_compatibility_mode:
+ movw $0x0e00 + 'i', %ds:(0xb8012)
+ movb $0xa8, %al ; outb %al, $0x80;
+
+ /*
+ * At this point we're in long mode but in 32bit compatibility mode
+ * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
+ * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we load
+ * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+ */
+
+ movw $0x0e00 + 'n', %ds:(0xb8014)
+ movb $0xa9, %al ; outb %al, $0x80
+
+ /* Load new GDT with the 64bit segment using 32bit descriptor */
+ movl $(pGDT32 - __START_KERNEL_map), %eax
+ lgdt (%eax)
+
+ movl $(wakeup_jumpvector - __START_KERNEL_map), %eax
+ /* Finally jump in 64bit mode */
+ ljmp *(%eax)
+
+wakeup_jumpvector:
+ .long wakeup_long64 - __START_KERNEL_map
+ .word __KERNEL_CS
+
+.code64
+
+ /* Hooray, we are in Long 64-bit mode (but still running in low memory) */
+wakeup_long64:
+ /*
+ * We must switch to a new descriptor in kernel space for the GDT
+ * because soon the kernel won't have access anymore to the userspace
+ * addresses where we're currently running on. We have to do that here
+ * because in 32bit we couldn't load a 64bit linear address.
+ */
+ lgdt cpu_gdt_descr - __START_KERNEL_map
+
+ movw $0x0e00 + 'u', %ds:(0xb8016)
+
+ nop
+ nop
+ movw $__KERNEL_DS, %ax
+ movw %ax, %ss
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %fs
+ movw %ax, %gs
+ movq saved_esp, %rsp
+
+ movw $0x0e00 + 'x', %ds:(0xb8018)
+ movq saved_ebx, %rbx
+ movq saved_edi, %rdi
+ movq saved_esi, %rsi
+ movq saved_ebp, %rbp
+
+ movw $0x0e00 + '!', %ds:(0xb801a)
+ movq saved_eip, %rax
+ jmp *%rax
+
+.code32
+
+ .align 64
+gdta:
+ .word 0, 0, 0, 0 # dummy
+
+ .word 0, 0, 0, 0 # unused
+
+ .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
+ .word 0 # base address = 0
+ .word 0x9B00 # code read/exec. ??? Why I need 0x9B00 (as opposed to 0x9A00 in order for this to work?)
+ .word 0x00CF # granularity = 4096, 386
+ # (+5th nibble of limit)
+
+ .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb)
+ .word 0 # base address = 0
+ .word 0x9200 # data read/write
+ .word 0x00CF # granularity = 4096, 386
+ # (+5th nibble of limit)
+# this is 64bit descriptor for code
+ .word 0xFFFF
+ .word 0
+ .word 0x9A00 # code read/exec
+ .word 0x00AF # as above, but it is long mode and with D=0
+
+idt_48a:
+ .word 0 # idt limit = 0
+ .word 0, 0 # idt base = 0L
+
+gdt_48a:
+ .word 0x8000 # gdt limit=2048,
+ # 256 GDT entries
+ .word 0, 0 # gdt base (filled in later)
+
+
+real_save_gdt: .word 0
+ .quad 0
+real_magic: .quad 0
+video_mode: .quad 0
+video_flags: .quad 0
+
+bogus_real_magic:
+ movb $0xba,%al ; outb %al,$0x80
+ jmp bogus_real_magic
+
+bogus_32_magic:
+ movb $0xb3,%al ; outb %al,$0x80
+ jmp bogus_32_magic
+
+bogus_31_magic:
+ movb $0xb1,%al ; outb %al,$0x80
+ jmp bogus_31_magic
+
+bogus_cpu:
+ movb $0xbc,%al ; outb %al,$0x80
+ jmp bogus_cpu
+
+
+/* This code uses an extended set of video mode numbers. These include:
+ * Aliases for standard modes
+ * NORMAL_VGA (-1)
+ * EXTENDED_VGA (-2)
+ * ASK_VGA (-3)
+ * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
+ * of compatibility when extending the table. These are between 0x00 and 0xff.
+ */
+#define VIDEO_FIRST_MENU 0x0000
+
+/* Standard BIOS video modes (BIOS number + 0x0100) */
+#define VIDEO_FIRST_BIOS 0x0100
+
+/* VESA BIOS video modes (VESA number + 0x0200) */
+#define VIDEO_FIRST_VESA 0x0200
+
+/* Video7 special modes (BIOS number + 0x0900) */
+#define VIDEO_FIRST_V7 0x0900
+
+# Setting of user mode (AX=mode ID) => CF=success
+mode_seta:
+ movw %ax, %bx
+#if 0
+ cmpb $0xff, %ah
+ jz setalias
+
+ testb $VIDEO_RECALC>>8, %ah
+ jnz _setrec
+
+ cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah
+ jnc setres
+
+ cmpb $VIDEO_FIRST_SPECIAL>>8, %ah
+ jz setspc
+
+ cmpb $VIDEO_FIRST_V7>>8, %ah
+ jz setv7
+#endif
+
+ cmpb $VIDEO_FIRST_VESA>>8, %ah
+ jnc check_vesaa
+#if 0
+ orb %ah, %ah
+ jz setmenu
+#endif
+
+ decb %ah
+# jz setbios Add bios modes later
+
+setbada: clc
+ ret
+
+check_vesaa:
+ subb $VIDEO_FIRST_VESA>>8, %bh
+ orw $0x4000, %bx # Use linear frame buffer
+ movw $0x4f02, %ax # VESA BIOS mode set call
+ int $0x10
+ cmpw $0x004f, %ax # AL=4f if implemented
+ jnz _setbada # AH=0 if OK
+
+ stc
+ ret
+
+_setbada: jmp setbada
+
+ .code64
+bogus_magic:
+ movw $0x0e00 + 'B', %ds:(0xb8018)
+ jmp bogus_magic
+
+bogus_magic2:
+ movw $0x0e00 + '2', %ds:(0xb8018)
+ jmp bogus_magic2
+
+
+wakeup_stack_begin: # Stack grows down
+
+.org 0xff0
+wakeup_stack: # Just below end of page
+
+ENTRY(wakeup_end)
+
+##
+# acpi_copy_wakeup_routine
+#
+# Copy the above routine to low memory.
+#
+# Parameters:
+# %rdi: place to copy wakeup routine to
+#
+# Returned address is location of code in low memory (past data and stack)
+#
+ENTRY(acpi_copy_wakeup_routine)
+ pushq %rax
+ pushq %rcx
+ pushq %rdx
+
+ sgdt saved_gdt
+ sidt saved_idt
+ sldt saved_ldt
+ str saved_tss
+
+ movq %cr3, %rdx
+ movq %rdx, saved_cr3
+ movq %cr4, %rdx
+ movq %rdx, saved_cr4
+ movq %cr0, %rdx
+ movq %rdx, saved_cr0
+ sgdt real_save_gdt - wakeup_start (,%rdi)
+ movl $MSR_EFER, %ecx
+ rdmsr
+ movl %eax, saved_efer
+ movl %edx, saved_efer2
+
+ movl saved_video_mode, %edx
+ movl %edx, video_mode - wakeup_start (,%rdi)
+ movl acpi_video_flags, %edx
+ movl %edx, video_flags - wakeup_start (,%rdi)
+ movq $0x12345678, real_magic - wakeup_start (,%rdi)
+ movq $0x123456789abcdef0, %rdx
+ movq %rdx, saved_magic
+
+ movl saved_magic - __START_KERNEL_map, %eax
+ cmpl $0x9abcdef0, %eax
+ jne bogus_32_magic
+
+ # make sure %cr4 is set correctly (features, etc)
+ movl saved_cr4 - __START_KERNEL_map, %eax
+ movq %rax, %cr4
+
+ movl saved_cr0 - __START_KERNEL_map, %eax
+ movq %rax, %cr0
+ jmp 1f # Flush pipelines
+1:
+ # restore the regs we used
+ popq %rdx
+ popq %rcx
+ popq %rax
+ENTRY(do_suspend_lowlevel_s4bios)
+ ret
+
+ .align 2
+ .p2align 4,,15
+.globl do_suspend_lowlevel
+ .type do_suspend_lowlevel,@function
+do_suspend_lowlevel:
+.LFB5:
+ subq $8, %rsp
+ xorl %eax, %eax
+ call save_processor_state
+
+ movq %rsp, saved_context_esp(%rip)
+ movq %rax, saved_context_eax(%rip)
+ movq %rbx, saved_context_ebx(%rip)
+ movq %rcx, saved_context_ecx(%rip)
+ movq %rdx, saved_context_edx(%rip)
+ movq %rbp, saved_context_ebp(%rip)
+ movq %rsi, saved_context_esi(%rip)
+ movq %rdi, saved_context_edi(%rip)
+ movq %r8, saved_context_r08(%rip)
+ movq %r9, saved_context_r09(%rip)
+ movq %r10, saved_context_r10(%rip)
+ movq %r11, saved_context_r11(%rip)
+ movq %r12, saved_context_r12(%rip)
+ movq %r13, saved_context_r13(%rip)
+ movq %r14, saved_context_r14(%rip)
+ movq %r15, saved_context_r15(%rip)
+ pushfq ; popq saved_context_eflags(%rip)
+
+ movq $.L97, saved_eip(%rip)
+
+ movq %rsp,saved_esp
+ movq %rbp,saved_ebp
+ movq %rbx,saved_ebx
+ movq %rdi,saved_edi
+ movq %rsi,saved_esi
+
+ addq $8, %rsp
+ movl $3, %edi
+ xorl %eax, %eax
+ jmp acpi_enter_sleep_state
+.L97:
+ .p2align 4,,7
+.L99:
+ .align 4
+ movl $24, %eax
+ movw %ax, %ds
+ movq saved_context+58(%rip), %rax
+ movq %rax, %cr4
+ movq saved_context+50(%rip), %rax
+ movq %rax, %cr3
+ movq saved_context+42(%rip), %rax
+ movq %rax, %cr2
+ movq saved_context+34(%rip), %rax
+ movq %rax, %cr0
+ pushq saved_context_eflags(%rip) ; popfq
+ movq saved_context_esp(%rip), %rsp
+ movq saved_context_ebp(%rip), %rbp
+ movq saved_context_eax(%rip), %rax
+ movq saved_context_ebx(%rip), %rbx
+ movq saved_context_ecx(%rip), %rcx
+ movq saved_context_edx(%rip), %rdx
+ movq saved_context_esi(%rip), %rsi
+ movq saved_context_edi(%rip), %rdi
+ movq saved_context_r08(%rip), %r8
+ movq saved_context_r09(%rip), %r9
+ movq saved_context_r10(%rip), %r10
+ movq saved_context_r11(%rip), %r11
+ movq saved_context_r12(%rip), %r12
+ movq saved_context_r13(%rip), %r13
+ movq saved_context_r14(%rip), %r14
+ movq saved_context_r15(%rip), %r15
+
+ xorl %eax, %eax
+ addq $8, %rsp
+ jmp restore_processor_state
+.LFE5:
+.Lfe5:
+ .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel
+
+.data
+ALIGN
+ENTRY(saved_ebp) .quad 0
+ENTRY(saved_esi) .quad 0
+ENTRY(saved_edi) .quad 0
+ENTRY(saved_ebx) .quad 0
+
+ENTRY(saved_eip) .quad 0
+ENTRY(saved_esp) .quad 0
+
+ENTRY(saved_magic) .quad 0
+
+ALIGN
+# saved registers
+saved_gdt: .quad 0,0
+saved_idt: .quad 0,0
+saved_ldt: .quad 0
+saved_tss: .quad 0
+
+saved_cr0: .quad 0
+saved_cr3: .quad 0
+saved_cr4: .quad 0
+saved_efer: .quad 0
+saved_efer2: .quad 0
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
new file mode 100644
index 000000000000..4baa99fe1e5c
--- /dev/null
+++ b/arch/x86_64/kernel/aperture.c
@@ -0,0 +1,286 @@
+/*
+ * Firmware replacement code.
+ *
+ * Work around broken BIOSes that don't set an aperture or only set the
+ * aperture in the AGP bridge.
+ * If all fails map the aperture over some low memory. This is cheaper than
+ * doing bounce buffering. The memory is lost. This is done at early boot
+ * because only the bootmem allocator can allocate 32+MB.
+ *
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/mmzone.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+#include <asm/proto.h>
+#include <asm/pci-direct.h>
+
+int iommu_aperture;
+int iommu_aperture_disabled __initdata = 0;
+int iommu_aperture_allowed __initdata = 0;
+
+int fallback_aper_order __initdata = 1; /* 64MB */
+int fallback_aper_force __initdata = 0;
+
+int fix_aperture __initdata = 1;
+
+/* This code runs before the PCI subsystem is initialized, so just
+ access the northbridge directly. */
+
+#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16))
+
+static u32 __init allocate_aperture(void)
+{
+#ifdef CONFIG_DISCONTIGMEM
+ pg_data_t *nd0 = NODE_DATA(0);
+#else
+ pg_data_t *nd0 = &contig_page_data;
+#endif
+ u32 aper_size;
+ void *p;
+
+ if (fallback_aper_order > 7)
+ fallback_aper_order = 7;
+ aper_size = (32 * 1024 * 1024) << fallback_aper_order;
+
+ /*
+ * Aperture has to be naturally aligned. This means an 2GB aperture won't
+ * have much chances to find a place in the lower 4GB of memory.
+ * Unfortunately we cannot move it up because that would make the
+ * IOMMU useless.
+ */
+ p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0);
+ if (!p || __pa(p)+aper_size > 0xffffffff) {
+ printk("Cannot allocate aperture memory hole (%p,%uK)\n",
+ p, aper_size>>10);
+ if (p)
+ free_bootmem_node(nd0, (unsigned long)p, aper_size);
+ return 0;
+ }
+ printk("Mapping aperture over %d KB of RAM @ %lx\n",
+ aper_size >> 10, __pa(p));
+ return (u32)__pa(p);
+}
+
+static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size)
+{
+ if (!aper_base)
+ return 0;
+ if (aper_size < 64*1024*1024) {
+ printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20);
+ return 0;
+ }
+ if (aper_base + aper_size >= 0xffffffff) {
+ printk("Aperture from %s beyond 4GB. Ignoring.\n",name);
+ return 0;
+ }
+ if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
+ printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name);
+ return 0;
+ }
+ return 1;
+}
+
+/* Find a PCI capability */
+static __u32 __init find_cap(int num, int slot, int func, int cap)
+{
+ u8 pos;
+ int bytes;
+ if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST))
+ return 0;
+ pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST);
+ for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
+ u8 id;
+ pos &= ~3;
+ id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID);
+ if (id == 0xff)
+ break;
+ if (id == cap)
+ return pos;
+ pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT);
+ }
+ return 0;
+}
+
+/* Read a standard AGPv3 bridge header */
+static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
+{
+ u32 apsize;
+ u32 apsizereg;
+ int nbits;
+ u32 aper_low, aper_hi;
+ u64 aper;
+
+ printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func);
+ apsizereg = read_pci_config_16(num,slot,func, cap + 0x14);
+ if (apsizereg == 0xffffffff) {
+ printk("APSIZE in AGP bridge unreadable\n");
+ return 0;
+ }
+
+ apsize = apsizereg & 0xfff;
+ /* Some BIOS use weird encodings not in the AGPv3 table. */
+ if (apsize & 0xff)
+ apsize |= 0xf00;
+ nbits = hweight16(apsize);
+ *order = 7 - nbits;
+ if ((int)*order < 0) /* < 32MB */
+ *order = 0;
+
+ aper_low = read_pci_config(num,slot,func, 0x10);
+ aper_hi = read_pci_config(num,slot,func,0x14);
+ aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32);
+
+ printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
+ aper, 32 << *order, apsizereg);
+
+ if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order))
+ return 0;
+ return (u32)aper;
+}
+
+/* Look for an AGP bridge. Windows only expects the aperture in the
+ AGP bridge and some BIOS forget to initialize the Northbridge too.
+ Work around this here.
+
+ Do an PCI bus scan by hand because we're running before the PCI
+ subsystem.
+
+ All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+ generically. It's probably overkill to always scan all slots because
+ the AGP bridges should be always an own bus on the HT hierarchy,
+ but do it here for future safety. */
+static __u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+{
+ int num, slot, func;
+
+ /* Poor man's PCI discovery */
+ for (num = 0; num < 32; num++) {
+ for (slot = 0; slot < 32; slot++) {
+ for (func = 0; func < 8; func++) {
+ u32 class, cap;
+ u8 type;
+ class = read_pci_config(num,slot,func,
+ PCI_CLASS_REVISION);
+ if (class == 0xffffffff)
+ break;
+
+ switch (class >> 16) {
+ case PCI_CLASS_BRIDGE_HOST:
+ case PCI_CLASS_BRIDGE_OTHER: /* needed? */
+ /* AGP bridge? */
+ cap = find_cap(num,slot,func,PCI_CAP_ID_AGP);
+ if (!cap)
+ break;
+ *valid_agp = 1;
+ return read_agp(num,slot,func,cap,order);
+ }
+
+ /* No multi-function device? */
+ type = read_pci_config_byte(num,slot,func,
+ PCI_HEADER_TYPE);
+ if (!(type & 0x80))
+ break;
+ }
+ }
+ }
+ printk("No AGP bridge found\n");
+ return 0;
+}
+
+void __init iommu_hole_init(void)
+{
+ int fix, num;
+ u32 aper_size, aper_alloc = 0, aper_order, last_aper_order = 0;
+ u64 aper_base, last_aper_base = 0;
+ int valid_agp = 0;
+
+ if (iommu_aperture_disabled || !fix_aperture)
+ return;
+
+ printk("Checking aperture...\n");
+
+ fix = 0;
+ for (num = 24; num < 32; num++) {
+ char name[30];
+ if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)
+ continue;
+
+ iommu_aperture = 1;
+
+ aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
+ aper_size = (32 * 1024 * 1024) << aper_order;
+ aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff;
+ aper_base <<= 25;
+
+ printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
+ aper_base, aper_size>>20);
+
+ sprintf(name, "northbridge cpu %d", num-24);
+
+ if (!aperture_valid(name, aper_base, aper_size)) {
+ fix = 1;
+ break;
+ }
+
+ if ((last_aper_order && aper_order != last_aper_order) ||
+ (last_aper_base && aper_base != last_aper_base)) {
+ fix = 1;
+ break;
+ }
+ last_aper_order = aper_order;
+ last_aper_base = aper_base;
+ }
+
+ if (!fix && !fallback_aper_force)
+ return;
+
+ if (!fallback_aper_force)
+ aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
+
+ if (aper_alloc) {
+ /* Got the aperture from the AGP bridge */
+ } else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) ||
+ force_iommu ||
+ valid_agp ||
+ fallback_aper_force) {
+ printk("Your BIOS doesn't leave a aperture memory hole\n");
+ printk("Please enable the IOMMU option in the BIOS setup\n");
+ printk("This costs you %d MB of RAM\n",
+ 32 << fallback_aper_order);
+
+ aper_order = fallback_aper_order;
+ aper_alloc = allocate_aperture();
+ if (!aper_alloc) {
+ /* Could disable AGP and IOMMU here, but it's probably
+ not worth it. But the later users cannot deal with
+ bad apertures and turning on the aperture over memory
+ causes very strange problems, so it's better to
+ panic early. */
+ panic("Not enough memory for aperture");
+ }
+ } else {
+ return;
+ }
+
+ /* Fix up the north bridges */
+ for (num = 24; num < 32; num++) {
+ if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)
+ continue;
+
+ /* Don't enable translation yet. That is done later.
+ Assume this BIOS didn't initialise the GART so
+ just overwrite all previous bits */
+ write_pci_config(0, num, 3, 0x90, aper_order<<1);
+ write_pci_config(0, num, 3, 0x94, aper_alloc>>25);
+ }
+}
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
new file mode 100644
index 000000000000..9826f03dc9a3
--- /dev/null
+++ b/arch/x86_64/kernel/apic.c
@@ -0,0 +1,1088 @@
+/*
+ * Local APIC handling, local APIC timers
+ *
+ * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively.
+ * Maciej W. Rozycki : Various updates and fixes.
+ * Mikael Pettersson : Power Management for UP-APIC.
+ * Pavel Machek and
+ * Mikael Pettersson : PM converted to driver model.
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+
+#include <asm/atomic.h>
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/pgalloc.h>
+#include <asm/mach_apic.h>
+
+int apic_verbosity;
+
+int disable_apic_timer __initdata;
+
+/* Using APIC to generate smp_local_timer_interrupt? */
+int using_apic_timer = 0;
+
+static DEFINE_PER_CPU(int, prof_multiplier) = 1;
+static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
+static DEFINE_PER_CPU(int, prof_counter) = 1;
+
+static void apic_pm_activate(void);
+
+void enable_NMI_through_LVT0 (void * dummy)
+{
+ unsigned int v, ver;
+
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+ v = APIC_DM_NMI; /* unmask and set to NMI */
+ apic_write_around(APIC_LVT0, v);
+}
+
+int get_maxlvt(void)
+{
+ unsigned int v, ver, maxlvt;
+
+ v = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = GET_APIC_MAXLVT(v);
+ return maxlvt;
+}
+
+void clear_local_APIC(void)
+{
+ int maxlvt;
+ unsigned int v;
+
+ maxlvt = get_maxlvt();
+
+ /*
+ * Masking an LVT entry on a P6 can trigger a local APIC error
+ * if the vector is zero. Mask LVTERR first to prevent this.
+ */
+ if (maxlvt >= 3) {
+ v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
+ apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
+ }
+ /*
+ * Careful: we have to set masks only first to deassert
+ * any level-triggered sources.
+ */
+ v = apic_read(APIC_LVTT);
+ apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+ v = apic_read(APIC_LVT0);
+ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+ v = apic_read(APIC_LVT1);
+ apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
+ if (maxlvt >= 4) {
+ v = apic_read(APIC_LVTPC);
+ apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
+ }
+
+ /*
+ * Clean APIC state for other OSs:
+ */
+ apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+ apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
+ if (maxlvt >= 3)
+ apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
+ if (maxlvt >= 4)
+ apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
+ v = GET_APIC_VERSION(apic_read(APIC_LVR));
+ if (APIC_INTEGRATED(v)) { /* !82489DX */
+ if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
+}
+
+void __init connect_bsp_APIC(void)
+{
+ if (pic_mode) {
+ /*
+ * Do not trust the local APIC being empty at bootup.
+ */
+ clear_local_APIC();
+ /*
+ * PIC mode, enable APIC mode in the IMCR, i.e.
+ * connect BSP's local APIC to INT and NMI lines.
+ */
+ apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n");
+ outb(0x70, 0x22);
+ outb(0x01, 0x23);
+ }
+}
+
+void disconnect_bsp_APIC(void)
+{
+ if (pic_mode) {
+ /*
+ * Put the board back into PIC mode (has an effect
+ * only on certain older boards). Note that APIC
+ * interrupts, including IPIs, won't work beyond
+ * this point! The only exception are INIT IPIs.
+ */
+ apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n");
+ outb(0x70, 0x22);
+ outb(0x00, 0x23);
+ }
+}
+
+void disable_local_APIC(void)
+{
+ unsigned int value;
+
+ clear_local_APIC();
+
+ /*
+ * Disable APIC (implies clearing of registers
+ * for 82489DX!).
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_SPIV_APIC_ENABLED;
+ apic_write_around(APIC_SPIV, value);
+}
+
+/*
+ * This is to verify that we're looking at a real local APIC.
+ * Check these against your board if the CPUs aren't getting
+ * started for no apparent reason.
+ */
+int __init verify_local_APIC(void)
+{
+ unsigned int reg0, reg1;
+
+ /*
+ * The version register is read-only in a real APIC.
+ */
+ reg0 = apic_read(APIC_LVR);
+ apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
+ apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
+ reg1 = apic_read(APIC_LVR);
+ apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
+
+ /*
+ * The two version reads above should print the same
+ * numbers. If the second one is different, then we
+ * poke at a non-APIC.
+ */
+ if (reg1 != reg0)
+ return 0;
+
+ /*
+ * Check if the version looks reasonably.
+ */
+ reg1 = GET_APIC_VERSION(reg0);
+ if (reg1 == 0x00 || reg1 == 0xff)
+ return 0;
+ reg1 = get_maxlvt();
+ if (reg1 < 0x02 || reg1 == 0xff)
+ return 0;
+
+ /*
+ * The ID register is read/write in a real APIC.
+ */
+ reg0 = apic_read(APIC_ID);
+ apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
+ apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
+ reg1 = apic_read(APIC_ID);
+ apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
+ apic_write(APIC_ID, reg0);
+ if (reg1 != (reg0 ^ APIC_ID_MASK))
+ return 0;
+
+ /*
+ * The next two are just to see if we have sane values.
+ * They're only really relevant if we're in Virtual Wire
+ * compatibility mode, but most boxes are anymore.
+ */
+ reg0 = apic_read(APIC_LVT0);
+ apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0);
+ reg1 = apic_read(APIC_LVT1);
+ apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
+
+ return 1;
+}
+
+void __init sync_Arb_IDs(void)
+{
+ /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
+ unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+ if (ver >= 0x14) /* P4 or higher */
+ return;
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+
+ apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
+ apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
+ | APIC_DM_INIT);
+}
+
+extern void __error_in_apic_c (void);
+
+/*
+ * An initial setup of the virtual wire mode.
+ */
+void __init init_bsp_APIC(void)
+{
+ unsigned int value, ver;
+
+ /*
+ * Don't do the setup now if we have a SMP BIOS as the
+ * through-I/O-APIC virtual wire mode might be active.
+ */
+ if (smp_found_config || !cpu_has_apic)
+ return;
+
+ value = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(value);
+
+ /*
+ * Do not trust the local APIC being empty at bootup.
+ */
+ clear_local_APIC();
+
+ /*
+ * Enable APIC.
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= APIC_SPIV_FOCUS_DISABLED;
+ value |= SPURIOUS_APIC_VECTOR;
+ apic_write_around(APIC_SPIV, value);
+
+ /*
+ * Set up the virtual wire mode.
+ */
+ apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+ value = APIC_DM_NMI;
+ if (!APIC_INTEGRATED(ver)) /* 82489DX */
+ value |= APIC_LVT_LEVEL_TRIGGER;
+ apic_write_around(APIC_LVT1, value);
+}
+
+void __init setup_local_APIC (void)
+{
+ unsigned int value, ver, maxlvt;
+
+ /* Pound the ESR really hard over the head with a big hammer - mbligh */
+ if (esr_disable) {
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ apic_write(APIC_ESR, 0);
+ }
+
+ value = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(value);
+
+ if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
+ __error_in_apic_c();
+
+ /*
+ * Double-check whether this APIC is really registered.
+ * This is meaningless in clustered apic mode, so we skip it.
+ */
+ if (!apic_id_registered())
+ BUG();
+
+ /*
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+ init_apic_ldr();
+
+ /*
+ * Set Task Priority to 'accept all'. We never change this
+ * later on.
+ */
+ value = apic_read(APIC_TASKPRI);
+ value &= ~APIC_TPRI_MASK;
+ apic_write_around(APIC_TASKPRI, value);
+
+ /*
+ * Now that we are all set up, enable the APIC
+ */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ /*
+ * Enable APIC
+ */
+ value |= APIC_SPIV_APIC_ENABLED;
+
+ /*
+ * Some unknown Intel IO/APIC (or APIC) errata is biting us with
+ * certain networking cards. If high frequency interrupts are
+ * happening on a particular IOAPIC pin, plus the IOAPIC routing
+ * entry is masked/unmasked at a high rate as well then sooner or
+ * later IOAPIC line gets 'stuck', no more interrupts are received
+ * from the device. If focus CPU is disabled then the hang goes
+ * away, oh well :-(
+ *
+ * [ This bug can be reproduced easily with a level-triggered
+ * PCI Ne2000 networking cards and PII/PIII processors, dual
+ * BX chipset. ]
+ */
+ /*
+ * Actually disabling the focus CPU check just makes the hang less
+ * frequent as it makes the interrupt distributon model be more
+ * like LRU than MRU (the short-term load is more even across CPUs).
+ * See also the comment in end_level_ioapic_irq(). --macro
+ */
+#if 1
+ /* Enable focus processor (bit==0) */
+ value &= ~APIC_SPIV_FOCUS_DISABLED;
+#else
+ /* Disable focus processor (bit==1) */
+ value |= APIC_SPIV_FOCUS_DISABLED;
+#endif
+ /*
+ * Set spurious IRQ vector
+ */
+ value |= SPURIOUS_APIC_VECTOR;
+ apic_write_around(APIC_SPIV, value);
+
+ /*
+ * Set up LVT0, LVT1:
+ *
+ * set up through-local-APIC on the BP's LINT0. This is not
+ * strictly necessary in pure symmetric-IO mode, but sometimes
+ * we delegate interrupts to the 8259A.
+ */
+ /*
+ * TODO: set up through-local-APIC from through-I/O-APIC? --macro
+ */
+ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
+ if (!smp_processor_id() && (pic_mode || !value)) {
+ value = APIC_DM_EXTINT;
+ apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
+ } else {
+ value = APIC_DM_EXTINT | APIC_LVT_MASKED;
+ apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id());
+ }
+ apic_write_around(APIC_LVT0, value);
+
+ /*
+ * only the BP should see the LINT1 NMI signal, obviously.
+ */
+ if (!smp_processor_id())
+ value = APIC_DM_NMI;
+ else
+ value = APIC_DM_NMI | APIC_LVT_MASKED;
+ if (!APIC_INTEGRATED(ver)) /* 82489DX */
+ value |= APIC_LVT_LEVEL_TRIGGER;
+ apic_write_around(APIC_LVT1, value);
+
+ if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */
+ unsigned oldvalue;
+ maxlvt = get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ oldvalue = apic_read(APIC_ESR);
+ value = ERROR_APIC_VECTOR; // enables sending errors
+ apic_write_around(APIC_LVTERR, value);
+ /*
+ * spec says clear errors after enabling vector.
+ */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+ value = apic_read(APIC_ESR);
+ if (value != oldvalue)
+ apic_printk(APIC_VERBOSE,
+ "ESR value after enabling vector: %08x, after %08x\n",
+ oldvalue, value);
+ } else {
+ if (esr_disable)
+ /*
+ * Something untraceble is creating bad interrupts on
+ * secondary quads ... for the moment, just leave the
+ * ESR disabled - we can't do anything useful with the
+ * errors anyway - mbligh
+ */
+ apic_printk(APIC_DEBUG, "Leaving ESR disabled.\n");
+ else
+ apic_printk(APIC_DEBUG, "No ESR for 82489DX.\n");
+ }
+
+ nmi_watchdog_default();
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ setup_apic_nmi_watchdog();
+ apic_pm_activate();
+}
+
+#ifdef CONFIG_PM
+
+static struct {
+ /* 'active' is true if the local APIC was enabled by us and
+ not the BIOS; this signifies that we are also responsible
+ for disabling it before entering apm/acpi suspend */
+ int active;
+ /* r/w apic fields */
+ unsigned int apic_id;
+ unsigned int apic_taskpri;
+ unsigned int apic_ldr;
+ unsigned int apic_dfr;
+ unsigned int apic_spiv;
+ unsigned int apic_lvtt;
+ unsigned int apic_lvtpc;
+ unsigned int apic_lvt0;
+ unsigned int apic_lvt1;
+ unsigned int apic_lvterr;
+ unsigned int apic_tmict;
+ unsigned int apic_tdcr;
+ unsigned int apic_thmr;
+} apic_pm_state;
+
+static int lapic_suspend(struct sys_device *dev, u32 state)
+{
+ unsigned long flags;
+
+ if (!apic_pm_state.active)
+ return 0;
+
+ apic_pm_state.apic_id = apic_read(APIC_ID);
+ apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
+ apic_pm_state.apic_ldr = apic_read(APIC_LDR);
+ apic_pm_state.apic_dfr = apic_read(APIC_DFR);
+ apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
+ apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
+ apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+ apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
+ apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
+ apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
+ apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
+ apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
+ apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+ local_save_flags(flags);
+ local_irq_disable();
+ disable_local_APIC();
+ local_irq_restore(flags);
+ return 0;
+}
+
+static int lapic_resume(struct sys_device *dev)
+{
+ unsigned int l, h;
+ unsigned long flags;
+
+ if (!apic_pm_state.active)
+ return 0;
+
+ /* XXX: Pavel needs this for S3 resume, but can't explain why */
+ set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
+
+ local_irq_save(flags);
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
+ apic_write(APIC_ID, apic_pm_state.apic_id);
+ apic_write(APIC_DFR, apic_pm_state.apic_dfr);
+ apic_write(APIC_LDR, apic_pm_state.apic_ldr);
+ apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
+ apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
+ apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
+ apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
+ apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+ apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+ apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
+ apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
+ apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ local_irq_restore(flags);
+ return 0;
+}
+
+static struct sysdev_class lapic_sysclass = {
+ set_kset_name("lapic"),
+ .resume = lapic_resume,
+ .suspend = lapic_suspend,
+};
+
+static struct sys_device device_lapic = {
+ .id = 0,
+ .cls = &lapic_sysclass,
+};
+
+static void __init apic_pm_activate(void)
+{
+ apic_pm_state.active = 1;
+}
+
+static int __init init_lapic_sysfs(void)
+{
+ int error;
+ if (!cpu_has_apic)
+ return 0;
+ /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
+ error = sysdev_class_register(&lapic_sysclass);
+ if (!error)
+ error = sysdev_register(&device_lapic);
+ return error;
+}
+device_initcall(init_lapic_sysfs);
+
+#else /* CONFIG_PM */
+
+static void apic_pm_activate(void) { }
+
+#endif /* CONFIG_PM */
+
+static int __init apic_set_verbosity(char *str)
+{
+ if (strcmp("debug", str) == 0)
+ apic_verbosity = APIC_DEBUG;
+ else if (strcmp("verbose", str) == 0)
+ apic_verbosity = APIC_VERBOSE;
+ else
+ printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+ " use apic=verbose or apic=debug", str);
+
+ return 0;
+}
+
+__setup("apic=", apic_set_verbosity);
+
+/*
+ * Detect and enable local APICs on non-SMP boards.
+ * Original code written by Keir Fraser.
+ * On AMD64 we trust the BIOS - if it says no APIC it is likely
+ * not correctly set up (usually the APIC timer won't work etc.)
+ */
+
+static int __init detect_init_APIC (void)
+{
+ if (!cpu_has_apic) {
+ printk(KERN_INFO "No local APIC present\n");
+ return -1;
+ }
+
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ boot_cpu_id = 0;
+ return 0;
+}
+
+void __init init_apic_mappings(void)
+{
+ unsigned long apic_phys;
+
+ /*
+ * If no local APIC can be found then set up a fake all
+ * zeroes page to simulate the local APIC and another
+ * one for the IO-APIC.
+ */
+ if (!smp_found_config && detect_init_APIC()) {
+ apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
+ apic_phys = __pa(apic_phys);
+ } else
+ apic_phys = mp_lapic_addr;
+
+ set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+ apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
+
+ /*
+ * Fetch the APIC ID of the BSP in case we have a
+ * default configuration (or the MP table is broken).
+ */
+ if (boot_cpu_id == -1U)
+ boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
+
+#ifdef CONFIG_X86_IO_APIC
+ {
+ unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
+ int i;
+
+ for (i = 0; i < nr_ioapics; i++) {
+ if (smp_found_config) {
+ ioapic_phys = mp_ioapics[i].mpc_apicaddr;
+ } else {
+ ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
+ ioapic_phys = __pa(ioapic_phys);
+ }
+ set_fixmap_nocache(idx, ioapic_phys);
+ apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
+ __fix_to_virt(idx), ioapic_phys);
+ idx++;
+ }
+ }
+#endif
+}
+
+/*
+ * This function sets up the local APIC timer, with a timeout of
+ * 'clocks' APIC bus clock. During calibration we actually call
+ * this function twice on the boot CPU, once with a bogus timeout
+ * value, second time for real. The other (noncalibrating) CPUs
+ * call this function only once, with the real, calibrated value.
+ *
+ * We do reads before writes even if unnecessary, to get around the
+ * P5 APIC double write bug.
+ */
+
+#define APIC_DIVISOR 16
+
+static void __setup_APIC_LVTT(unsigned int clocks)
+{
+ unsigned int lvtt_value, tmp_value, ver;
+
+ ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+ lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
+ if (!APIC_INTEGRATED(ver))
+ lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
+ apic_write_around(APIC_LVTT, lvtt_value);
+
+ /*
+ * Divide PICLK by 16
+ */
+ tmp_value = apic_read(APIC_TDCR);
+ apic_write_around(APIC_TDCR, (tmp_value
+ & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
+ | APIC_TDR_DIV_16);
+
+ apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
+}
+
+static void setup_APIC_timer(unsigned int clocks)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ /* For some reasons this doesn't work on Simics, so fake it for now */
+ if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) {
+ __setup_APIC_LVTT(clocks);
+ return;
+ }
+
+ /* wait for irq slice */
+ if (vxtime.hpet_address) {
+ int trigger = hpet_readl(HPET_T0_CMP);
+ while (hpet_readl(HPET_COUNTER) >= trigger)
+ /* do nothing */ ;
+ while (hpet_readl(HPET_COUNTER) < trigger)
+ /* do nothing */ ;
+ } else {
+ int c1, c2;
+ outb_p(0x00, 0x43);
+ c2 = inb_p(0x40);
+ c2 |= inb_p(0x40) << 8;
+ do {
+ c1 = c2;
+ outb_p(0x00, 0x43);
+ c2 = inb_p(0x40);
+ c2 |= inb_p(0x40) << 8;
+ } while (c2 - c1 < 300);
+ }
+
+ __setup_APIC_LVTT(clocks);
+
+ local_irq_restore(flags);
+}
+
+/*
+ * In this function we calibrate APIC bus clocks to the external
+ * timer. Unfortunately we cannot use jiffies and the timer irq
+ * to calibrate, since some later bootup code depends on getting
+ * the first irq? Ugh.
+ *
+ * We want to do the calibration only once since we
+ * want to have local timer irqs syncron. CPUs connected
+ * by the same APIC bus have the very same bus frequency.
+ * And we want to have irqs off anyways, no accidental
+ * APIC irq that way.
+ */
+
+#define TICK_COUNT 100000000
+
+static int __init calibrate_APIC_clock(void)
+{
+ int apic, apic_start, tsc, tsc_start;
+ int result;
+ /*
+ * Put whatever arbitrary (but long enough) timeout
+ * value into the APIC clock, we just want to get the
+ * counter running for calibration.
+ */
+ __setup_APIC_LVTT(1000000000);
+
+ apic_start = apic_read(APIC_TMCCT);
+ rdtscl(tsc_start);
+
+ do {
+ apic = apic_read(APIC_TMCCT);
+ rdtscl(tsc);
+ } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT);
+
+ result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start);
+
+ printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
+ result / 1000 / 1000, result / 1000 % 1000);
+
+ return result * APIC_DIVISOR / HZ;
+}
+
+static unsigned int calibration_result;
+
+void __init setup_boot_APIC_clock (void)
+{
+ if (disable_apic_timer) {
+ printk(KERN_INFO "Disabling APIC timer\n");
+ return;
+ }
+
+ printk(KERN_INFO "Using local APIC timer interrupts.\n");
+ using_apic_timer = 1;
+
+ local_irq_disable();
+
+ calibration_result = calibrate_APIC_clock();
+ /*
+ * Now set up the timer for real.
+ */
+ setup_APIC_timer(calibration_result);
+
+ local_irq_enable();
+}
+
+void __init setup_secondary_APIC_clock(void)
+{
+ local_irq_disable(); /* FIXME: Do we need this? --RR */
+ setup_APIC_timer(calibration_result);
+ local_irq_enable();
+}
+
+void __init disable_APIC_timer(void)
+{
+ if (using_apic_timer) {
+ unsigned long v;
+
+ v = apic_read(APIC_LVTT);
+ apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+ }
+}
+
+void enable_APIC_timer(void)
+{
+ if (using_apic_timer) {
+ unsigned long v;
+
+ v = apic_read(APIC_LVTT);
+ apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
+ }
+}
+
+/*
+ * the frequency of the profiling timer can be changed
+ * by writing a multiplier value into /proc/profile.
+ */
+int setup_profiling_timer(unsigned int multiplier)
+{
+ int i;
+
+ /*
+ * Sanity check. [at least 500 APIC cycles should be
+ * between APIC interrupts as a rule of thumb, to avoid
+ * irqs flooding us]
+ */
+ if ( (!multiplier) || (calibration_result/multiplier < 500))
+ return -EINVAL;
+
+ /*
+ * Set the new multiplier for each CPU. CPUs don't start using the
+ * new values until the next timer interrupt in which they do process
+ * accounting. At that time they also adjust their APIC timers
+ * accordingly.
+ */
+ for (i = 0; i < NR_CPUS; ++i)
+ per_cpu(prof_multiplier, i) = multiplier;
+
+ return 0;
+}
+
+#undef APIC_DIVISOR
+
+/*
+ * Local timer interrupt handler. It does both profiling and
+ * process statistics/rescheduling.
+ *
+ * We do profiling in every local tick, statistics/rescheduling
+ * happen only every 'profiling multiplier' ticks. The default
+ * multiplier is 1 and it can be changed by writing the new multiplier
+ * value into /proc/profile.
+ */
+
+void smp_local_timer_interrupt(struct pt_regs *regs)
+{
+ int cpu = smp_processor_id();
+
+ profile_tick(CPU_PROFILING, regs);
+ if (--per_cpu(prof_counter, cpu) <= 0) {
+ /*
+ * The multiplier may have changed since the last time we got
+ * to this point as a result of the user writing to
+ * /proc/profile. In this case we need to adjust the APIC
+ * timer accordingly.
+ *
+ * Interrupts are already masked off at this point.
+ */
+ per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
+ if (per_cpu(prof_counter, cpu) !=
+ per_cpu(prof_old_multiplier, cpu)) {
+ __setup_APIC_LVTT(calibration_result/
+ per_cpu(prof_counter, cpu));
+ per_cpu(prof_old_multiplier, cpu) =
+ per_cpu(prof_counter, cpu);
+ }
+
+#ifdef CONFIG_SMP
+ update_process_times(user_mode(regs));
+#endif
+ }
+
+ /*
+ * We take the 'long' return path, and there every subsystem
+ * grabs the appropriate locks (kernel lock/ irq lock).
+ *
+ * we might want to decouple profiling from the 'long path',
+ * and do the profiling totally in assembly.
+ *
+ * Currently this isn't too much of an issue (performance wise),
+ * we can take more than 100K local irqs per second on a 100 MHz P5.
+ */
+}
+
+/*
+ * Local APIC timer interrupt. This is the most natural way for doing
+ * local interrupts, but local timer interrupts can be emulated by
+ * broadcast interrupts too. [in case the hw doesn't support APIC timers]
+ *
+ * [ if a single-CPU system runs an SMP kernel then we call the local
+ * interrupt as well. Thus we cannot inline the local irq ... ]
+ */
+void smp_apic_timer_interrupt(struct pt_regs *regs)
+{
+ /*
+ * the NMI deadlock-detector uses this.
+ */
+ add_pda(apic_timer_irqs, 1);
+
+ /*
+ * NOTE! We'd better ACK the irq immediately,
+ * because timer handling can be slow.
+ */
+ ack_APIC_irq();
+ /*
+ * update_process_times() expects us to have done irq_enter().
+ * Besides, if we don't timer interrupts ignore the global
+ * interrupt lock, which is the WrongThing (tm) to do.
+ */
+ irq_enter();
+ smp_local_timer_interrupt(regs);
+ irq_exit();
+}
+
+/*
+ * oem_force_hpet_timer -- force HPET mode for some boxes.
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ *
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis. Use available data to take a good guess.
+ * If in doubt, go HPET.
+ */
+__init int oem_force_hpet_timer(void)
+{
+ int i, clusters, zeros;
+ unsigned id;
+ DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
+
+ bitmap_empty(clustermap, NUM_APIC_CLUSTERS);
+
+ for (i = 0; i < NR_CPUS; i++) {
+ id = bios_cpu_apicid[i];
+ if (id != BAD_APICID)
+ __set_bit(APIC_CLUSTERID(id), clustermap);
+ }
+
+ /* Problem: Partially populated chassis may not have CPUs in some of
+ * the APIC clusters they have been allocated. Only present CPUs have
+ * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since
+ * clusters are allocated sequentially, count zeros only if they are
+ * bounded by ones.
+ */
+ clusters = 0;
+ zeros = 0;
+ for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
+ if (test_bit(i, clustermap)) {
+ clusters += 1 + zeros;
+ zeros = 0;
+ } else
+ ++zeros;
+ }
+
+ /*
+ * If clusters > 2, then should be multi-chassis. Return 1 for HPET.
+ * Else return 0 to use TSC.
+ * May have to revisit this when multi-core + hyperthreaded CPUs come
+ * out, but AFAIK this will work even for them.
+ */
+ return (clusters > 2);
+}
+
+/*
+ * This interrupt should _never_ happen with our APIC/SMP architecture
+ */
+asmlinkage void smp_spurious_interrupt(void)
+{
+ unsigned int v;
+ irq_enter();
+ /*
+ * Check if this really is a spurious interrupt and ACK it
+ * if it is a vectored one. Just in case...
+ * Spurious interrupts should not be ACKed.
+ */
+ v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
+ if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
+ ack_APIC_irq();
+
+#if 0
+ static unsigned long last_warning;
+ static unsigned long skipped;
+
+ /* see sw-dev-man vol 3, chapter 7.4.13.5 */
+ if (time_before(last_warning+30*HZ,jiffies)) {
+ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n",
+ smp_processor_id(), skipped);
+ last_warning = jiffies;
+ skipped = 0;
+ } else {
+ skipped++;
+ }
+#endif
+ irq_exit();
+}
+
+/*
+ * This interrupt should never happen with our APIC/SMP architecture
+ */
+
+asmlinkage void smp_error_interrupt(void)
+{
+ unsigned int v, v1;
+
+ irq_enter();
+ /* First tickle the hardware, only then report what went on. -- REW */
+ v = apic_read(APIC_ESR);
+ apic_write(APIC_ESR, 0);
+ v1 = apic_read(APIC_ESR);
+ ack_APIC_irq();
+ atomic_inc(&irq_err_count);
+
+ /* Here is what the APIC error bits mean:
+ 0: Send CS error
+ 1: Receive CS error
+ 2: Send accept error
+ 3: Receive accept error
+ 4: Reserved
+ 5: Send illegal vector
+ 6: Received illegal vector
+ 7: Illegal register address
+ */
+ printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
+ smp_processor_id(), v , v1);
+ irq_exit();
+}
+
+int disable_apic;
+
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+int __init APIC_init_uniprocessor (void)
+{
+ if (disable_apic) {
+ printk(KERN_INFO "Apic disabled\n");
+ return -1;
+ }
+ if (!cpu_has_apic) {
+ disable_apic = 1;
+ printk(KERN_INFO "Apic disabled by BIOS\n");
+ return -1;
+ }
+
+ verify_local_APIC();
+
+ connect_bsp_APIC();
+
+ phys_cpu_present_map = physid_mask_of_physid(0);
+ apic_write_around(APIC_ID, boot_cpu_id);
+
+ setup_local_APIC();
+
+#ifdef CONFIG_X86_IO_APIC
+ if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+ setup_IO_APIC();
+ else
+ nr_ioapics = 0;
+#endif
+ setup_boot_APIC_clock();
+
+ return 0;
+}
+
+static __init int setup_disableapic(char *str)
+{
+ disable_apic = 1;
+ return 0;
+}
+
+static __init int setup_nolapic(char *str)
+{
+ disable_apic = 1;
+ return 0;
+}
+
+static __init int setup_noapictimer(char *str)
+{
+ disable_apic_timer = 1;
+ return 0;
+}
+
+/* dummy parsing: see setup.c */
+
+__setup("disableapic", setup_disableapic);
+__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */
+
+__setup("noapictimer", setup_noapictimer);
+
+/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
new file mode 100644
index 000000000000..35b4c3fcbb37
--- /dev/null
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -0,0 +1,69 @@
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
+ */
+
+#include <linux/sched.h>
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/hardirq.h>
+#include <linux/suspend.h>
+#include <asm/pda.h>
+#include <asm/processor.h>
+#include <asm/segment.h>
+#include <asm/thread_info.h>
+#include <asm/ia32.h>
+
+#define DEFINE(sym, val) \
+ asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+
+#define BLANK() asm volatile("\n->" : : )
+
+int main(void)
+{
+#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
+ ENTRY(state);
+ ENTRY(flags);
+ ENTRY(thread);
+ ENTRY(pid);
+ BLANK();
+#undef ENTRY
+#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry))
+ ENTRY(flags);
+ ENTRY(addr_limit);
+ ENTRY(preempt_count);
+ BLANK();
+#undef ENTRY
+#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
+ ENTRY(kernelstack);
+ ENTRY(oldrsp);
+ ENTRY(pcurrent);
+ ENTRY(irqrsp);
+ ENTRY(irqcount);
+ ENTRY(cpunumber);
+ ENTRY(irqstackptr);
+ BLANK();
+#undef ENTRY
+#ifdef CONFIG_IA32_EMULATION
+#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry))
+ ENTRY(eax);
+ ENTRY(ebx);
+ ENTRY(ecx);
+ ENTRY(edx);
+ ENTRY(esi);
+ ENTRY(edi);
+ ENTRY(ebp);
+ ENTRY(esp);
+ ENTRY(eip);
+ BLANK();
+#undef ENTRY
+ DEFINE(IA32_RT_SIGFRAME_sigcontext,
+ offsetof (struct rt_sigframe32, uc.uc_mcontext));
+ BLANK();
+#endif
+ DEFINE(pbe_address, offsetof(struct pbe, address));
+ DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
+ DEFINE(pbe_next, offsetof(struct pbe, next));
+ return 0;
+}
diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig
new file mode 100644
index 000000000000..81f1562e5393
--- /dev/null
+++ b/arch/x86_64/kernel/cpufreq/Kconfig
@@ -0,0 +1,96 @@
+#
+# CPU Frequency scaling
+#
+
+menu "CPU Frequency scaling"
+
+source "drivers/cpufreq/Kconfig"
+
+if CPU_FREQ
+
+comment "CPUFreq processor drivers"
+
+config X86_POWERNOW_K8
+ tristate "AMD Opteron/Athlon64 PowerNow!"
+ select CPU_FREQ_TABLE
+ help
+ This adds the CPUFreq driver for mobile AMD Opteron/Athlon64 processors.
+
+ For details, take a look at <file:Documentation/cpu-freq/>.
+
+ If in doubt, say N.
+
+config X86_POWERNOW_K8_ACPI
+ bool
+ depends on X86_POWERNOW_K8 && ACPI_PROCESSOR
+ depends on !(X86_POWERNOW_K8 = y && ACPI_PROCESSOR = m)
+ default y
+
+config X86_SPEEDSTEP_CENTRINO
+ tristate "Intel Enhanced SpeedStep"
+ select CPU_FREQ_TABLE
+ depends on ACPI_PROCESSOR
+ help
+ This adds the CPUFreq driver for Enhanced SpeedStep enabled
+ mobile CPUs. This means Intel Pentium M (Centrino) CPUs
+ or 64bit enabled Intel Xeons.
+
+ For details, take a look at <file:Documentation/cpu-freq/>.
+
+ If in doubt, say N.
+
+config X86_SPEEDSTEP_CENTRINO_ACPI
+ bool
+ depends on X86_SPEEDSTEP_CENTRINO
+ default y
+
+config X86_ACPI_CPUFREQ
+ tristate "ACPI Processor P-States driver"
+ depends on ACPI_PROCESSOR
+ help
+ This driver adds a CPUFreq driver which utilizes the ACPI
+ Processor Performance States.
+
+ For details, take a look at <file:Documentation/cpu-freq/>.
+
+ If in doubt, say N.
+
+comment "shared options"
+
+config X86_ACPI_CPUFREQ_PROC_INTF
+ bool "/proc/acpi/processor/../performance interface (deprecated)"
+ depends on PROC_FS
+ depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI
+ help
+ This enables the deprecated /proc/acpi/processor/../performance
+ interface. While it is helpful for debugging, the generic,
+ cross-architecture cpufreq interfaces should be used.
+
+ If in doubt, say N.
+
+config X86_P4_CLOCKMOD
+ tristate "Intel Pentium 4 clock modulation"
+ depends on EMBEDDED
+ help
+ This adds the clock modulation driver for Intel Pentium 4 / XEON
+ processors. When enabled it will lower CPU temperature by skipping
+ clocks.
+
+ This driver should be only used in exceptional
+ circumstances when very low power is needed because it causes severe
+ slowdowns and noticeable latencies. Normally Speedstep should be used
+ instead.
+
+ For details, take a look at <file:Documentation/cpu-freq/>.
+
+ Unless you are absolutely sure say N.
+
+
+config X86_SPEEDSTEP_LIB
+ tristate
+ default X86_P4_CLOCKMOD
+
+endif
+
+endmenu
+
diff --git a/arch/x86_64/kernel/cpufreq/Makefile b/arch/x86_64/kernel/cpufreq/Makefile
new file mode 100644
index 000000000000..d8b593879224
--- /dev/null
+++ b/arch/x86_64/kernel/cpufreq/Makefile
@@ -0,0 +1,17 @@
+#
+# Reuse the i386 cpufreq drivers
+#
+
+SRCDIR := ../../../i386/kernel/cpu/cpufreq
+
+obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
+obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
+obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
+obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
+obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
+
+powernow-k8-objs := ${SRCDIR}/powernow-k8.o
+speedstep-centrino-objs := ${SRCDIR}/speedstep-centrino.o
+acpi-cpufreq-objs := ${SRCDIR}/acpi-cpufreq.o
+p4-clockmod-objs := ${SRCDIR}/p4-clockmod.o
+speedstep-lib-objs := ${SRCDIR}/speedstep-lib.o
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
new file mode 100644
index 000000000000..657003e461e6
--- /dev/null
+++ b/arch/x86_64/kernel/e820.c
@@ -0,0 +1,513 @@
+/*
+ * Handle the memory map.
+ * The functions here do the job until bootmem takes over.
+ * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <asm/page.h>
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/bootsetup.h>
+
+extern char _end[];
+
+/*
+ * PFN of last memory page.
+ */
+unsigned long end_pfn;
+
+/*
+ * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
+ * The direct mapping extends to end_pfn_map, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long end_pfn_map;
+
+/*
+ * Last pfn which the user wants to use.
+ */
+unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
+
+extern struct resource code_resource, data_resource;
+
+/* Check for some hardcoded bad areas that early boot is not allowed to touch */
+static inline int bad_addr(unsigned long *addrp, unsigned long size)
+{
+ unsigned long addr = *addrp, last = addr + size;
+
+ /* various gunk below that needed for SMP startup */
+ if (addr < 0x8000) {
+ *addrp = 0x8000;
+ return 1;
+ }
+
+ /* direct mapping tables of the kernel */
+ if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) {
+ *addrp = table_end << PAGE_SHIFT;
+ return 1;
+ }
+
+ /* initrd */
+#ifdef CONFIG_BLK_DEV_INITRD
+ if (LOADER_TYPE && INITRD_START && last >= INITRD_START &&
+ addr < INITRD_START+INITRD_SIZE) {
+ *addrp = INITRD_START + INITRD_SIZE;
+ return 1;
+ }
+#endif
+ /* kernel code + 640k memory hole (later should not be needed, but
+ be paranoid for now) */
+ if (last >= 640*1024 && addr < __pa_symbol(&_end)) {
+ *addrp = __pa_symbol(&_end);
+ return 1;
+ }
+ /* XXX ramdisk image here? */
+ return 0;
+}
+
+int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
+{
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ if (type && ei->type != type)
+ continue;
+ if (ei->addr >= end || ei->addr + ei->size < start)
+ continue;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Find a free area in a specific range.
+ */
+unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size)
+{
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ unsigned long addr = ei->addr, last;
+ if (ei->type != E820_RAM)
+ continue;
+ if (addr < start)
+ addr = start;
+ if (addr > ei->addr + ei->size)
+ continue;
+ while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size)
+ ;
+ last = addr + size;
+ if (last > ei->addr + ei->size)
+ continue;
+ if (last > end)
+ continue;
+ return addr;
+ }
+ return -1UL;
+}
+
+/*
+ * Free bootmem based on the e820 table for a node.
+ */
+void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
+{
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ unsigned long last, addr;
+
+ if (ei->type != E820_RAM ||
+ ei->addr+ei->size <= start ||
+ ei->addr > end)
+ continue;
+
+ addr = round_up(ei->addr, PAGE_SIZE);
+ if (addr < start)
+ addr = start;
+
+ last = round_down(ei->addr + ei->size, PAGE_SIZE);
+ if (last >= end)
+ last = end;
+
+ if (last > addr && last-addr >= PAGE_SIZE)
+ free_bootmem_node(pgdat, addr, last-addr);
+ }
+}
+
+/*
+ * Find the highest page frame number we have available
+ */
+unsigned long __init e820_end_of_ram(void)
+{
+ int i;
+ unsigned long end_pfn = 0;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ unsigned long start, end;
+
+ start = round_up(ei->addr, PAGE_SIZE);
+ end = round_down(ei->addr + ei->size, PAGE_SIZE);
+ if (start >= end)
+ continue;
+ if (ei->type == E820_RAM) {
+ if (end > end_pfn<<PAGE_SHIFT)
+ end_pfn = end>>PAGE_SHIFT;
+ } else {
+ if (end > end_pfn_map<<PAGE_SHIFT)
+ end_pfn_map = end>>PAGE_SHIFT;
+ }
+ }
+
+ if (end_pfn > end_pfn_map)
+ end_pfn_map = end_pfn;
+ if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
+ end_pfn_map = MAXMEM>>PAGE_SHIFT;
+ if (end_pfn > end_user_pfn)
+ end_pfn = end_user_pfn;
+ if (end_pfn > end_pfn_map)
+ end_pfn = end_pfn_map;
+
+ return end_pfn;
+}
+
+/*
+ * Mark e820 reserved areas as busy for the resource manager.
+ */
+void __init e820_reserve_resources(void)
+{
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct resource *res;
+ if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
+ continue;
+ res = alloc_bootmem_low(sizeof(struct resource));
+ switch (e820.map[i].type) {
+ case E820_RAM: res->name = "System RAM"; break;
+ case E820_ACPI: res->name = "ACPI Tables"; break;
+ case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
+ default: res->name = "reserved";
+ }
+ res->start = e820.map[i].addr;
+ res->end = res->start + e820.map[i].size - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ request_resource(&iomem_resource, res);
+ if (e820.map[i].type == E820_RAM) {
+ /*
+ * We don't know which RAM region contains kernel data,
+ * so we try it repeatedly and let the resource manager
+ * test it.
+ */
+ request_resource(res, &code_resource);
+ request_resource(res, &data_resource);
+ }
+ }
+}
+
+/*
+ * Add a memory region to the kernel e820 map.
+ */
+void __init add_memory_region(unsigned long start, unsigned long size, int type)
+{
+ int x = e820.nr_map;
+
+ if (x == E820MAX) {
+ printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+ return;
+ }
+
+ e820.map[x].addr = start;
+ e820.map[x].size = size;
+ e820.map[x].type = type;
+ e820.nr_map++;
+}
+
+void __init e820_print_map(char *who)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ printk(" %s: %016Lx - %016Lx ", who,
+ (unsigned long long) e820.map[i].addr,
+ (unsigned long long) (e820.map[i].addr + e820.map[i].size));
+ switch (e820.map[i].type) {
+ case E820_RAM: printk("(usable)\n");
+ break;
+ case E820_RESERVED:
+ printk("(reserved)\n");
+ break;
+ case E820_ACPI:
+ printk("(ACPI data)\n");
+ break;
+ case E820_NVS:
+ printk("(ACPI NVS)\n");
+ break;
+ default: printk("type %u\n", e820.map[i].type);
+ break;
+ }
+ }
+}
+
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries. The following
+ * replaces the original e820 map with a new one, removing overlaps.
+ *
+ */
+static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
+{
+ struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+ unsigned long long addr; /* address for this change point */
+ };
+ static struct change_member change_point_list[2*E820MAX] __initdata;
+ static struct change_member *change_point[2*E820MAX] __initdata;
+ static struct e820entry *overlap_list[E820MAX] __initdata;
+ static struct e820entry new_bios[E820MAX] __initdata;
+ struct change_member *change_tmp;
+ unsigned long current_type, last_type;
+ unsigned long long last_addr;
+ int chgidx, still_changing;
+ int overlap_entries;
+ int new_bios_entry;
+ int old_nr, new_nr;
+ int i;
+
+ /*
+ Visually we're performing the following (1,2,3,4 = memory types)...
+
+ Sample memory map (w/overlaps):
+ ____22__________________
+ ______________________4_
+ ____1111________________
+ _44_____________________
+ 11111111________________
+ ____________________33__
+ ___________44___________
+ __________33333_________
+ ______________22________
+ ___________________2222_
+ _________111111111______
+ _____________________11_
+ _________________4______
+
+ Sanitized equivalent (no overlap):
+ 1_______________________
+ _44_____________________
+ ___1____________________
+ ____22__________________
+ ______11________________
+ _________1______________
+ __________3_____________
+ ___________44___________
+ _____________33_________
+ _______________2________
+ ________________1_______
+ _________________4______
+ ___________________2____
+ ____________________33__
+ ______________________4_
+ */
+
+ /* if there's only one memory region, don't bother */
+ if (*pnr_map < 2)
+ return -1;
+
+ old_nr = *pnr_map;
+
+ /* bail out if we find any unreasonable addresses in bios map */
+ for (i=0; i<old_nr; i++)
+ if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+ return -1;
+
+ /* create pointers for initial change-point information (for sorting) */
+ for (i=0; i < 2*old_nr; i++)
+ change_point[i] = &change_point_list[i];
+
+ /* record all known change-points (starting and ending addresses) */
+ chgidx = 0;
+ for (i=0; i < old_nr; i++) {
+ change_point[chgidx]->addr = biosmap[i].addr;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
+ change_point[chgidx++]->pbios = &biosmap[i];
+ }
+
+ /* sort change-point list by memory addresses (low -> high) */
+ still_changing = 1;
+ while (still_changing) {
+ still_changing = 0;
+ for (i=1; i < 2*old_nr; i++) {
+ /* if <current_addr> > <last_addr>, swap */
+ /* or, if current=<start_addr> & last=<end_addr>, swap */
+ if ((change_point[i]->addr < change_point[i-1]->addr) ||
+ ((change_point[i]->addr == change_point[i-1]->addr) &&
+ (change_point[i]->addr == change_point[i]->pbios->addr) &&
+ (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
+ )
+ {
+ change_tmp = change_point[i];
+ change_point[i] = change_point[i-1];
+ change_point[i-1] = change_tmp;
+ still_changing=1;
+ }
+ }
+ }
+
+ /* create a new bios memory map, removing overlaps */
+ overlap_entries=0; /* number of entries in the overlap table */
+ new_bios_entry=0; /* index for creating new bios map entries */
+ last_type = 0; /* start with undefined memory type */
+ last_addr = 0; /* start with 0 as last starting address */
+ /* loop through change-points, determining affect on the new bios map */
+ for (chgidx=0; chgidx < 2*old_nr; chgidx++)
+ {
+ /* keep track of all overlapping bios entries */
+ if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
+ {
+ /* add map entry to overlap list (> 1 entry implies an overlap) */
+ overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
+ }
+ else
+ {
+ /* remove entry from list (order independent, so swap with last) */
+ for (i=0; i<overlap_entries; i++)
+ {
+ if (overlap_list[i] == change_point[chgidx]->pbios)
+ overlap_list[i] = overlap_list[overlap_entries-1];
+ }
+ overlap_entries--;
+ }
+ /* if there are overlapping entries, decide which "type" to use */
+ /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
+ current_type = 0;
+ for (i=0; i<overlap_entries; i++)
+ if (overlap_list[i]->type > current_type)
+ current_type = overlap_list[i]->type;
+ /* continue building up new bios map based on this information */
+ if (current_type != last_type) {
+ if (last_type != 0) {
+ new_bios[new_bios_entry].size =
+ change_point[chgidx]->addr - last_addr;
+ /* move forward only if the new size was non-zero */
+ if (new_bios[new_bios_entry].size != 0)
+ if (++new_bios_entry >= E820MAX)
+ break; /* no more space left for new bios entries */
+ }
+ if (current_type != 0) {
+ new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
+ new_bios[new_bios_entry].type = current_type;
+ last_addr=change_point[chgidx]->addr;
+ }
+ last_type = current_type;
+ }
+ }
+ new_nr = new_bios_entry; /* retain count for new bios entries */
+
+ /* copy new bios mapping into original location */
+ memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
+ *pnr_map = new_nr;
+
+ return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+ *
+ * We check to see that the memory map contains at least 2 elements
+ * before we'll use it, because the detection code in setup.S may
+ * not be perfect and most every PC known to man has two memory
+ * regions: one from 0 to 640k, and one from 1mb up. (The IBM
+ * thinkpad 560x, for example, does not cooperate with the memory
+ * detection code.)
+ */
+static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+{
+ /* Only one memory region (or negative)? Ignore it */
+ if (nr_map < 2)
+ return -1;
+
+ do {
+ unsigned long start = biosmap->addr;
+ unsigned long size = biosmap->size;
+ unsigned long end = start + size;
+ unsigned long type = biosmap->type;
+
+ /* Overflow in 64 bits? Ignore the memory map. */
+ if (start > end)
+ return -1;
+
+ /*
+ * Some BIOSes claim RAM in the 640k - 1M region.
+ * Not right. Fix it up.
+ *
+ * This should be removed on Hammer which is supposed to not
+ * have non e820 covered ISA mappings there, but I had some strange
+ * problems so it stays for now. -AK
+ */
+ if (type == E820_RAM) {
+ if (start < 0x100000ULL && end > 0xA0000ULL) {
+ if (start < 0xA0000ULL)
+ add_memory_region(start, 0xA0000ULL-start, type);
+ if (end <= 0x100000ULL)
+ continue;
+ start = 0x100000ULL;
+ size = end - start;
+ }
+ }
+
+ add_memory_region(start, size, type);
+ } while (biosmap++,--nr_map);
+ return 0;
+}
+
+void __init setup_memory_region(void)
+{
+ char *who = "BIOS-e820";
+
+ /*
+ * Try to copy the BIOS-supplied E820-map.
+ *
+ * Otherwise fake a memory map; one section from 0k->640k,
+ * the next section from 1mb->appropriate_mem_k
+ */
+ sanitize_e820_map(E820_MAP, &E820_MAP_NR);
+ if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
+ unsigned long mem_size;
+
+ /* compare results from other methods and take the greater */
+ if (ALT_MEM_K < EXT_MEM_K) {
+ mem_size = EXT_MEM_K;
+ who = "BIOS-88";
+ } else {
+ mem_size = ALT_MEM_K;
+ who = "BIOS-e801";
+ }
+
+ e820.nr_map = 0;
+ add_memory_region(0, LOWMEMSIZE(), E820_RAM);
+ add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
+ }
+ printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+ e820_print_map(who);
+}
+
+void __init parse_memopt(char *p, char **from)
+{
+ end_user_pfn = memparse(p, from);
+ end_user_pfn >>= PAGE_SHIFT;
+}
+
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
new file mode 100644
index 000000000000..750bcd0655dc
--- /dev/null
+++ b/arch/x86_64/kernel/early_printk.c
@@ -0,0 +1,220 @@
+#include <linux/console.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+
+/* Simple VGA output */
+
+#ifdef __i386__
+#define VGABASE (__ISA_IO_base + 0xb8000)
+#else
+#define VGABASE ((void __iomem *)0xffffffff800b8000UL)
+#endif
+
+#define MAX_YPOS 25
+#define MAX_XPOS 80
+
+static int current_ypos = 1, current_xpos = 0;
+
+static void early_vga_write(struct console *con, const char *str, unsigned n)
+{
+ char c;
+ int i, k, j;
+
+ while ((c = *str++) != '\0' && n-- > 0) {
+ if (current_ypos >= MAX_YPOS) {
+ /* scroll 1 line up */
+ for (k = 1, j = 0; k < MAX_YPOS; k++, j++) {
+ for (i = 0; i < MAX_XPOS; i++) {
+ writew(readw(VGABASE + 2*(MAX_XPOS*k + i)),
+ VGABASE + 2*(MAX_XPOS*j + i));
+ }
+ }
+ for (i = 0; i < MAX_XPOS; i++)
+ writew(0x720, VGABASE + 2*(MAX_XPOS*j + i));
+ current_ypos = MAX_YPOS-1;
+ }
+ if (c == '\n') {
+ current_xpos = 0;
+ current_ypos++;
+ } else if (c != '\r') {
+ writew(((0x7 << 8) | (unsigned short) c),
+ VGABASE + 2*(MAX_XPOS*current_ypos +
+ current_xpos++));
+ if (current_xpos >= MAX_XPOS) {
+ current_xpos = 0;
+ current_ypos++;
+ }
+ }
+ }
+}
+
+static struct console early_vga_console = {
+ .name = "earlyvga",
+ .write = early_vga_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
+
+int early_serial_base = 0x3f8; /* ttyS0 */
+
+#define XMTRDY 0x20
+
+#define DLAB 0x80
+
+#define TXR 0 /* Transmit register (WRITE) */
+#define RXR 0 /* Receive register (READ) */
+#define IER 1 /* Interrupt Enable */
+#define IIR 2 /* Interrupt ID */
+#define FCR 2 /* FIFO control */
+#define LCR 3 /* Line control */
+#define MCR 4 /* Modem control */
+#define LSR 5 /* Line Status */
+#define MSR 6 /* Modem Status */
+#define DLL 0 /* Divisor Latch Low */
+#define DLH 1 /* Divisor latch High */
+
+static int early_serial_putc(unsigned char ch)
+{
+ unsigned timeout = 0xffff;
+ while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ cpu_relax();
+ outb(ch, early_serial_base + TXR);
+ return timeout ? 0 : -1;
+}
+
+static void early_serial_write(struct console *con, const char *s, unsigned n)
+{
+ while (*s && n-- > 0) {
+ early_serial_putc(*s);
+ if (*s == '\n')
+ early_serial_putc('\r');
+ s++;
+ }
+}
+
+#define DEFAULT_BAUD 9600
+
+static __init void early_serial_init(char *s)
+{
+ unsigned char c;
+ unsigned divisor;
+ unsigned baud = DEFAULT_BAUD;
+ char *e;
+
+ if (*s == ',')
+ ++s;
+
+ if (*s) {
+ unsigned port;
+ if (!strncmp(s,"0x",2)) {
+ early_serial_base = simple_strtoul(s, &e, 16);
+ } else {
+ static int bases[] = { 0x3f8, 0x2f8 };
+
+ if (!strncmp(s,"ttyS",4))
+ s += 4;
+ port = simple_strtoul(s, &e, 10);
+ if (port > 1 || s == e)
+ port = 0;
+ early_serial_base = bases[port];
+ }
+ s += strcspn(s, ",");
+ if (*s == ',')
+ s++;
+ }
+
+ outb(0x3, early_serial_base + LCR); /* 8n1 */
+ outb(0, early_serial_base + IER); /* no interrupt */
+ outb(0, early_serial_base + FCR); /* no fifo */
+ outb(0x3, early_serial_base + MCR); /* DTR + RTS */
+
+ if (*s) {
+ baud = simple_strtoul(s, &e, 0);
+ if (baud == 0 || s == e)
+ baud = DEFAULT_BAUD;
+ }
+
+ divisor = 115200 / baud;
+ c = inb(early_serial_base + LCR);
+ outb(c | DLAB, early_serial_base + LCR);
+ outb(divisor & 0xff, early_serial_base + DLL);
+ outb((divisor >> 8) & 0xff, early_serial_base + DLH);
+ outb(c & ~DLAB, early_serial_base + LCR);
+}
+
+static struct console early_serial_console = {
+ .name = "earlyser",
+ .write = early_serial_write,
+ .flags = CON_PRINTBUFFER,
+ .index = -1,
+};
+
+/* Direct interface for emergencies */
+struct console *early_console = &early_vga_console;
+static int early_console_initialized = 0;
+
+void early_printk(const char *fmt, ...)
+{
+ char buf[512];
+ int n;
+ va_list ap;
+
+ va_start(ap,fmt);
+ n = vscnprintf(buf,512,fmt,ap);
+ early_console->write(early_console,buf,n);
+ va_end(ap);
+}
+
+static int keep_early;
+
+int __init setup_early_printk(char *opt)
+{
+ char *space;
+ char buf[256];
+
+ if (early_console_initialized)
+ return -1;
+
+ opt = strchr(opt, '=') + 1;
+
+ strlcpy(buf,opt,sizeof(buf));
+ space = strchr(buf, ' ');
+ if (space)
+ *space = 0;
+
+ if (strstr(buf,"keep"))
+ keep_early = 1;
+
+ if (!strncmp(buf, "serial", 6)) {
+ early_serial_init(buf + 6);
+ early_console = &early_serial_console;
+ } else if (!strncmp(buf, "ttyS", 4)) {
+ early_serial_init(buf);
+ early_console = &early_serial_console;
+ } else if (!strncmp(buf, "vga", 3)) {
+ early_console = &early_vga_console;
+ }
+ early_console_initialized = 1;
+ register_console(early_console);
+ return 0;
+}
+
+void __init disable_early_printk(void)
+{
+ if (!early_console_initialized || !early_console)
+ return;
+ if (!keep_early) {
+ printk("disabling early console\n");
+ unregister_console(early_console);
+ early_console_initialized = 0;
+ } else {
+ printk("keeping early console\n");
+ }
+}
+
+__setup("earlyprintk=", setup_early_printk);
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
new file mode 100644
index 000000000000..e126284db7a8
--- /dev/null
+++ b/arch/x86_64/kernel/entry.S
@@ -0,0 +1,920 @@
+/*
+ * linux/arch/x86_64/entry.S
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ *
+ * $Id$
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after an interrupt and after each system call.
+ *
+ * Normal syscalls and interrupts don't save a full stack frame, this is
+ * only done for syscall tracing, signals or fork/exec et.al.
+ *
+ * A note on terminology:
+ * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
+ * - partial stack frame: partially saved registers upto R11.
+ * - full stack frame: Like partial stack frame, but all register saved.
+ *
+ * TODO:
+ * - schedule it carefully for the final hardware.
+ */
+
+#define ASSEMBLY 1
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/offset.h>
+#include <asm/msr.h>
+#include <asm/unistd.h>
+#include <asm/thread_info.h>
+#include <asm/hw_irq.h>
+
+ .code64
+
+#ifdef CONFIG_PREEMPT
+#define preempt_stop cli
+#else
+#define preempt_stop
+#define retint_kernel retint_restore_args
+#endif
+
+/*
+ * C code is not supposed to know about undefined top of stack. Every time
+ * a C function with an pt_regs argument is called from the SYSCALL based
+ * fast path FIXUP_TOP_OF_STACK is needed.
+ * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
+ * manipulation.
+ */
+
+ /* %rsp:at FRAMEEND */
+ .macro FIXUP_TOP_OF_STACK tmp
+ movq %gs:pda_oldrsp,\tmp
+ movq \tmp,RSP(%rsp)
+ movq $__USER_DS,SS(%rsp)
+ movq $__USER_CS,CS(%rsp)
+ movq $-1,RCX(%rsp)
+ movq R11(%rsp),\tmp /* get eflags */
+ movq \tmp,EFLAGS(%rsp)
+ .endm
+
+ .macro RESTORE_TOP_OF_STACK tmp,offset=0
+ movq RSP-\offset(%rsp),\tmp
+ movq \tmp,%gs:pda_oldrsp
+ movq EFLAGS-\offset(%rsp),\tmp
+ movq \tmp,R11-\offset(%rsp)
+ .endm
+
+ .macro FAKE_STACK_FRAME child_rip
+ /* push in order ss, rsp, eflags, cs, rip */
+ xorq %rax, %rax
+ pushq %rax /* ss */
+ CFI_ADJUST_CFA_OFFSET 8
+ pushq %rax /* rsp */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_OFFSET rip,0
+ pushq $(1<<9) /* eflags - interrupts on */
+ CFI_ADJUST_CFA_OFFSET 8
+ pushq $__KERNEL_CS /* cs */
+ CFI_ADJUST_CFA_OFFSET 8
+ pushq \child_rip /* rip */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_OFFSET rip,0
+ pushq %rax /* orig rax */
+ CFI_ADJUST_CFA_OFFSET 8
+ .endm
+
+ .macro UNFAKE_STACK_FRAME
+ addq $8*6, %rsp
+ CFI_ADJUST_CFA_OFFSET -(6*8)
+ .endm
+
+ .macro CFI_DEFAULT_STACK
+ CFI_ADJUST_CFA_OFFSET (SS)
+ CFI_OFFSET r15,R15-SS
+ CFI_OFFSET r14,R14-SS
+ CFI_OFFSET r13,R13-SS
+ CFI_OFFSET r12,R12-SS
+ CFI_OFFSET rbp,RBP-SS
+ CFI_OFFSET rbx,RBX-SS
+ CFI_OFFSET r11,R11-SS
+ CFI_OFFSET r10,R10-SS
+ CFI_OFFSET r9,R9-SS
+ CFI_OFFSET r8,R8-SS
+ CFI_OFFSET rax,RAX-SS
+ CFI_OFFSET rcx,RCX-SS
+ CFI_OFFSET rdx,RDX-SS
+ CFI_OFFSET rsi,RSI-SS
+ CFI_OFFSET rdi,RDI-SS
+ CFI_OFFSET rsp,RSP-SS
+ CFI_OFFSET rip,RIP-SS
+ .endm
+/*
+ * A newly forked process directly context switches into this.
+ */
+/* rdi: prev */
+ENTRY(ret_from_fork)
+ CFI_STARTPROC
+ CFI_DEFAULT_STACK
+ call schedule_tail
+ GET_THREAD_INFO(%rcx)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+ jnz rff_trace
+rff_action:
+ RESTORE_REST
+ testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread?
+ je int_ret_from_sys_call
+ testl $_TIF_IA32,threadinfo_flags(%rcx)
+ jnz int_ret_from_sys_call
+ RESTORE_TOP_OF_STACK %rdi,ARGOFFSET
+ jmp ret_from_sys_call
+rff_trace:
+ movq %rsp,%rdi
+ call syscall_trace_leave
+ GET_THREAD_INFO(%rcx)
+ jmp rff_action
+ CFI_ENDPROC
+
+/*
+ * System call entry. Upto 6 arguments in registers are supported.
+ *
+ * SYSCALL does not save anything on the stack and does not change the
+ * stack pointer.
+ */
+
+/*
+ * Register setup:
+ * rax system call number
+ * rdi arg0
+ * rcx return address for syscall/sysret, C arg3
+ * rsi arg1
+ * rdx arg2
+ * r10 arg3 (--> moved to rcx for C)
+ * r8 arg4
+ * r9 arg5
+ * r11 eflags for syscall/sysret, temporary for C
+ * r12-r15,rbp,rbx saved by C code, not touched.
+ *
+ * Interrupts are off on entry.
+ * Only called from user space.
+ *
+ * XXX if we had a free scratch register we could save the RSP into the stack frame
+ * and report it properly in ps. Unfortunately we haven't.
+ */
+
+ENTRY(system_call)
+ CFI_STARTPROC
+ swapgs
+ movq %rsp,%gs:pda_oldrsp
+ movq %gs:pda_kernelstack,%rsp
+ sti
+ SAVE_ARGS 8,1
+ movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
+ movq %rcx,RIP-ARGOFFSET(%rsp)
+ GET_THREAD_INFO(%rcx)
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
+ jnz tracesys
+ cmpq $__NR_syscall_max,%rax
+ ja badsys
+ movq %r10,%rcx
+ call *sys_call_table(,%rax,8) # XXX: rip relative
+ movq %rax,RAX-ARGOFFSET(%rsp)
+/*
+ * Syscall return path ending with SYSRET (fast path)
+ * Has incomplete stack frame and undefined top of stack.
+ */
+ .globl ret_from_sys_call
+ret_from_sys_call:
+ movl $_TIF_WORK_MASK,%edi
+ /* edi: flagmask */
+sysret_check:
+ GET_THREAD_INFO(%rcx)
+ cli
+ movl threadinfo_flags(%rcx),%edx
+ andl %edi,%edx
+ jnz sysret_careful
+ movq RIP-ARGOFFSET(%rsp),%rcx
+ RESTORE_ARGS 0,-ARG_SKIP,1
+ movq %gs:pda_oldrsp,%rsp
+ swapgs
+ sysretq
+
+ /* Handle reschedules */
+ /* edx: work, edi: workmask */
+sysret_careful:
+ bt $TIF_NEED_RESCHED,%edx
+ jnc sysret_signal
+ sti
+ pushq %rdi
+ call schedule
+ popq %rdi
+ jmp sysret_check
+
+ /* Handle a signal */
+sysret_signal:
+ sti
+ testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+ jz 1f
+
+ /* Really a signal */
+ /* edx: work flags (arg3) */
+ leaq do_notify_resume(%rip),%rax
+ leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
+ xorl %esi,%esi # oldset -> arg2
+ call ptregscall_common
+1: movl $_TIF_NEED_RESCHED,%edi
+ jmp sysret_check
+
+ /* Do syscall tracing */
+tracesys:
+ SAVE_REST
+ movq $-ENOSYS,RAX(%rsp)
+ FIXUP_TOP_OF_STACK %rdi
+ movq %rsp,%rdi
+ call syscall_trace_enter
+ LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
+ RESTORE_REST
+ cmpq $__NR_syscall_max,%rax
+ ja 1f
+ movq %r10,%rcx /* fixup for C */
+ call *sys_call_table(,%rax,8)
+ movq %rax,RAX-ARGOFFSET(%rsp)
+1: SAVE_REST
+ movq %rsp,%rdi
+ call syscall_trace_leave
+ RESTORE_TOP_OF_STACK %rbx
+ RESTORE_REST
+ jmp ret_from_sys_call
+
+badsys:
+ movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+ jmp ret_from_sys_call
+
+/*
+ * Syscall return path ending with IRET.
+ * Has correct top of stack, but partial stack frame.
+ */
+ENTRY(int_ret_from_sys_call)
+ cli
+ testl $3,CS-ARGOFFSET(%rsp)
+ je retint_restore_args
+ movl $_TIF_ALLWORK_MASK,%edi
+ /* edi: mask to check */
+int_with_check:
+ GET_THREAD_INFO(%rcx)
+ movl threadinfo_flags(%rcx),%edx
+ andl %edi,%edx
+ jnz int_careful
+ jmp retint_swapgs
+
+ /* Either reschedule or signal or syscall exit tracking needed. */
+ /* First do a reschedule test. */
+ /* edx: work, edi: workmask */
+int_careful:
+ bt $TIF_NEED_RESCHED,%edx
+ jnc int_very_careful
+ sti
+ pushq %rdi
+ call schedule
+ popq %rdi
+ jmp int_with_check
+
+ /* handle signals and tracing -- both require a full stack frame */
+int_very_careful:
+ sti
+ SAVE_REST
+ /* Check for syscall exit trace */
+ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
+ jz int_signal
+ pushq %rdi
+ leaq 8(%rsp),%rdi # &ptregs -> arg1
+ call syscall_trace_leave
+ popq %rdi
+ btr $TIF_SYSCALL_TRACE,%edi
+ btr $TIF_SYSCALL_AUDIT,%edi
+ btr $TIF_SINGLESTEP,%edi
+ jmp int_restore_rest
+
+int_signal:
+ testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx
+ jz 1f
+ movq %rsp,%rdi # &ptregs -> arg1
+ xorl %esi,%esi # oldset -> arg2
+ call do_notify_resume
+1: movl $_TIF_NEED_RESCHED,%edi
+int_restore_rest:
+ RESTORE_REST
+ jmp int_with_check
+ CFI_ENDPROC
+
+/*
+ * Certain special system calls that need to save a complete full stack frame.
+ */
+
+ .macro PTREGSCALL label,func,arg
+ .globl \label
+\label:
+ leaq \func(%rip),%rax
+ leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
+ jmp ptregscall_common
+ .endm
+
+ PTREGSCALL stub_clone, sys_clone, %r8
+ PTREGSCALL stub_fork, sys_fork, %rdi
+ PTREGSCALL stub_vfork, sys_vfork, %rdi
+ PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx
+ PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
+ PTREGSCALL stub_iopl, sys_iopl, %rsi
+
+ENTRY(ptregscall_common)
+ CFI_STARTPROC
+ popq %r11
+ CFI_ADJUST_CFA_OFFSET -8
+ SAVE_REST
+ movq %r11, %r15
+ FIXUP_TOP_OF_STACK %r11
+ call *%rax
+ RESTORE_TOP_OF_STACK %r11
+ movq %r15, %r11
+ RESTORE_REST
+ pushq %r11
+ CFI_ADJUST_CFA_OFFSET 8
+ ret
+ CFI_ENDPROC
+
+ENTRY(stub_execve)
+ CFI_STARTPROC
+ popq %r11
+ CFI_ADJUST_CFA_OFFSET -8
+ SAVE_REST
+ movq %r11, %r15
+ FIXUP_TOP_OF_STACK %r11
+ call sys_execve
+ GET_THREAD_INFO(%rcx)
+ bt $TIF_IA32,threadinfo_flags(%rcx)
+ jc exec_32bit
+ RESTORE_TOP_OF_STACK %r11
+ movq %r15, %r11
+ RESTORE_REST
+ push %r11
+ ret
+
+exec_32bit:
+ CFI_ADJUST_CFA_OFFSET REST_SKIP
+ movq %rax,RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+
+/*
+ * sigreturn is special because it needs to restore all registers on return.
+ * This cannot be done with SYSRET, so use the IRET return path instead.
+ */
+ENTRY(stub_rt_sigreturn)
+ CFI_STARTPROC
+ addq $8, %rsp
+ SAVE_REST
+ movq %rsp,%rdi
+ FIXUP_TOP_OF_STACK %r11
+ call sys_rt_sigreturn
+ movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+ RESTORE_REST
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+
+/*
+ * Interrupt entry/exit.
+ *
+ * Interrupt entry points save only callee clobbered registers in fast path.
+ *
+ * Entry runs with interrupts off.
+ */
+
+/* 0(%rsp): interrupt number */
+ .macro interrupt func
+ CFI_STARTPROC simple
+ CFI_DEF_CFA rsp,(SS-RDI)
+ CFI_REL_OFFSET rsp,(RSP-ORIG_RAX)
+ CFI_REL_OFFSET rip,(RIP-ORIG_RAX)
+ cld
+#ifdef CONFIG_DEBUG_INFO
+ SAVE_ALL
+ movq %rsp,%rdi
+ /*
+ * Setup a stack frame pointer. This allows gdb to trace
+ * back to the original stack.
+ */
+ movq %rsp,%rbp
+ CFI_DEF_CFA_REGISTER rbp
+#else
+ SAVE_ARGS
+ leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
+#endif
+ testl $3,CS(%rdi)
+ je 1f
+ swapgs
+1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count
+ movq %gs:pda_irqstackptr,%rax
+ cmoveq %rax,%rsp
+ pushq %rdi # save old stack
+ call \func
+ .endm
+
+ENTRY(common_interrupt)
+ interrupt do_IRQ
+ /* 0(%rsp): oldrsp-ARGOFFSET */
+ret_from_intr:
+ popq %rdi
+ cli
+ subl $1,%gs:pda_irqcount
+#ifdef CONFIG_DEBUG_INFO
+ movq RBP(%rdi),%rbp
+#endif
+ leaq ARGOFFSET(%rdi),%rsp
+exit_intr:
+ GET_THREAD_INFO(%rcx)
+ testl $3,CS-ARGOFFSET(%rsp)
+ je retint_kernel
+
+ /* Interrupt came from user space */
+ /*
+ * Has a correct top of stack, but a partial stack frame
+ * %rcx: thread info. Interrupts off.
+ */
+retint_with_reschedule:
+ movl $_TIF_WORK_MASK,%edi
+retint_check:
+ movl threadinfo_flags(%rcx),%edx
+ andl %edi,%edx
+ jnz retint_careful
+retint_swapgs:
+ cli
+ swapgs
+retint_restore_args:
+ cli
+ RESTORE_ARGS 0,8,0
+iret_label:
+ iretq
+
+ .section __ex_table,"a"
+ .quad iret_label,bad_iret
+ .previous
+ .section .fixup,"ax"
+ /* force a signal here? this matches i386 behaviour */
+ /* running with kernel gs */
+bad_iret:
+ movq $-9999,%rdi /* better code? */
+ jmp do_exit
+ .previous
+
+ /* edi: workmask, edx: work */
+retint_careful:
+ bt $TIF_NEED_RESCHED,%edx
+ jnc retint_signal
+ sti
+ pushq %rdi
+ call schedule
+ popq %rdi
+ GET_THREAD_INFO(%rcx)
+ cli
+ jmp retint_check
+
+retint_signal:
+ testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+ jz retint_swapgs
+ sti
+ SAVE_REST
+ movq $-1,ORIG_RAX(%rsp)
+ xorq %rsi,%rsi # oldset
+ movq %rsp,%rdi # &pt_regs
+ call do_notify_resume
+ RESTORE_REST
+ cli
+ movl $_TIF_NEED_RESCHED,%edi
+ GET_THREAD_INFO(%rcx)
+ jmp retint_check
+
+#ifdef CONFIG_PREEMPT
+ /* Returning to kernel space. Check if we need preemption */
+ /* rcx: threadinfo. interrupts off. */
+ .p2align
+retint_kernel:
+ cmpl $0,threadinfo_preempt_count(%rcx)
+ jnz retint_restore_args
+ bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
+ jnc retint_restore_args
+ bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
+ jnc retint_restore_args
+ call preempt_schedule_irq
+ jmp exit_intr
+#endif
+ CFI_ENDPROC
+
+/*
+ * APIC interrupts.
+ */
+ .macro apicinterrupt num,func
+ pushq $\num-256
+ interrupt \func
+ jmp ret_from_intr
+ CFI_ENDPROC
+ .endm
+
+ENTRY(thermal_interrupt)
+ apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
+
+#ifdef CONFIG_SMP
+ENTRY(reschedule_interrupt)
+ apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
+
+ENTRY(invalidate_interrupt)
+ apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt
+
+ENTRY(call_function_interrupt)
+ apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ENTRY(apic_timer_interrupt)
+ apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
+
+ENTRY(error_interrupt)
+ apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
+
+ENTRY(spurious_interrupt)
+ apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
+#endif
+
+/*
+ * Exception entry points.
+ */
+ .macro zeroentry sym
+ pushq $0 /* push error code/oldrax */
+ pushq %rax /* push real oldrax to the rdi slot */
+ leaq \sym(%rip),%rax
+ jmp error_entry
+ .endm
+
+ .macro errorentry sym
+ pushq %rax
+ leaq \sym(%rip),%rax
+ jmp error_entry
+ .endm
+
+ /* error code is on the stack already */
+ /* handle NMI like exceptions that can happen everywhere */
+ .macro paranoidentry sym
+ SAVE_ALL
+ cld
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+ testl %edx,%edx
+ js 1f
+ swapgs
+ xorl %ebx,%ebx
+1: movq %rsp,%rdi
+ movq ORIG_RAX(%rsp),%rsi
+ movq $-1,ORIG_RAX(%rsp)
+ call \sym
+ .endm
+
+/*
+ * Exception entry point. This expects an error code/orig_rax on the stack
+ * and the exception handler in %rax.
+ */
+ENTRY(error_entry)
+ CFI_STARTPROC simple
+ CFI_DEF_CFA rsp,(SS-RDI)
+ CFI_REL_OFFSET rsp,(RSP-RDI)
+ CFI_REL_OFFSET rip,(RIP-RDI)
+ /* rdi slot contains rax, oldrax contains error code */
+ cld
+ subq $14*8,%rsp
+ CFI_ADJUST_CFA_OFFSET (14*8)
+ movq %rsi,13*8(%rsp)
+ CFI_REL_OFFSET rsi,RSI
+ movq 14*8(%rsp),%rsi /* load rax from rdi slot */
+ movq %rdx,12*8(%rsp)
+ CFI_REL_OFFSET rdx,RDX
+ movq %rcx,11*8(%rsp)
+ CFI_REL_OFFSET rcx,RCX
+ movq %rsi,10*8(%rsp) /* store rax */
+ CFI_REL_OFFSET rax,RAX
+ movq %r8, 9*8(%rsp)
+ CFI_REL_OFFSET r8,R8
+ movq %r9, 8*8(%rsp)
+ CFI_REL_OFFSET r9,R9
+ movq %r10,7*8(%rsp)
+ CFI_REL_OFFSET r10,R10
+ movq %r11,6*8(%rsp)
+ CFI_REL_OFFSET r11,R11
+ movq %rbx,5*8(%rsp)
+ CFI_REL_OFFSET rbx,RBX
+ movq %rbp,4*8(%rsp)
+ CFI_REL_OFFSET rbp,RBP
+ movq %r12,3*8(%rsp)
+ CFI_REL_OFFSET r12,R12
+ movq %r13,2*8(%rsp)
+ CFI_REL_OFFSET r13,R13
+ movq %r14,1*8(%rsp)
+ CFI_REL_OFFSET r14,R14
+ movq %r15,(%rsp)
+ CFI_REL_OFFSET r15,R15
+ xorl %ebx,%ebx
+ testl $3,CS(%rsp)
+ je error_kernelspace
+error_swapgs:
+ swapgs
+error_sti:
+ movq %rdi,RDI(%rsp)
+ movq %rsp,%rdi
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp)
+ call *%rax
+ /* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
+error_exit:
+ movl %ebx,%eax
+ RESTORE_REST
+ cli
+ GET_THREAD_INFO(%rcx)
+ testl %eax,%eax
+ jne retint_kernel
+ movl threadinfo_flags(%rcx),%edx
+ movl $_TIF_WORK_MASK,%edi
+ andl %edi,%edx
+ jnz retint_careful
+ swapgs
+ RESTORE_ARGS 0,8,0
+ iretq
+ CFI_ENDPROC
+
+error_kernelspace:
+ incl %ebx
+ /* There are two places in the kernel that can potentially fault with
+ usergs. Handle them here. The exception handlers after
+ iret run with kernel gs again, so don't set the user space flag.
+ B stepping K8s sometimes report an truncated RIP for IRET
+ exceptions returning to compat mode. Check for these here too. */
+ leaq iret_label(%rip),%rbp
+ cmpq %rbp,RIP(%rsp)
+ je error_swapgs
+ movl %ebp,%ebp /* zero extend */
+ cmpq %rbp,RIP(%rsp)
+ je error_swapgs
+ cmpq $gs_change,RIP(%rsp)
+ je error_swapgs
+ jmp error_sti
+
+ /* Reload gs selector with exception handling */
+ /* edi: new selector */
+ENTRY(load_gs_index)
+ pushf
+ cli
+ swapgs
+gs_change:
+ movl %edi,%gs
+2: mfence /* workaround */
+ swapgs
+ popf
+ ret
+
+ .section __ex_table,"a"
+ .align 8
+ .quad gs_change,bad_gs
+ .previous
+ .section .fixup,"ax"
+ /* running with kernelgs */
+bad_gs:
+ swapgs /* switch back to user gs */
+ xorl %eax,%eax
+ movl %eax,%gs
+ jmp 2b
+ .previous
+
+/*
+ * Create a kernel thread.
+ *
+ * C extern interface:
+ * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+ *
+ * asm input arguments:
+ * rdi: fn, rsi: arg, rdx: flags
+ */
+ENTRY(kernel_thread)
+ CFI_STARTPROC
+ FAKE_STACK_FRAME $child_rip
+ SAVE_ALL
+
+ # rdi: flags, rsi: usp, rdx: will be &pt_regs
+ movq %rdx,%rdi
+ orq kernel_thread_flags(%rip),%rdi
+ movq $-1, %rsi
+ movq %rsp, %rdx
+
+ xorl %r8d,%r8d
+ xorl %r9d,%r9d
+
+ # clone now
+ call do_fork
+ movq %rax,RAX(%rsp)
+ xorl %edi,%edi
+
+ /*
+ * It isn't worth to check for reschedule here,
+ * so internally to the x86_64 port you can rely on kernel_thread()
+ * not to reschedule the child before returning, this avoids the need
+ * of hacks for example to fork off the per-CPU idle tasks.
+ * [Hopefully no generic code relies on the reschedule -AK]
+ */
+ RESTORE_ALL
+ UNFAKE_STACK_FRAME
+ ret
+ CFI_ENDPROC
+
+
+child_rip:
+ /*
+ * Here we are in the child and the registers are set as they were
+ * at kernel_thread() invocation in the parent.
+ */
+ movq %rdi, %rax
+ movq %rsi, %rdi
+ call *%rax
+ # exit
+ xorq %rdi, %rdi
+ call do_exit
+
+/*
+ * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
+ *
+ * C extern interface:
+ * extern long execve(char *name, char **argv, char **envp)
+ *
+ * asm input arguments:
+ * rdi: name, rsi: argv, rdx: envp
+ *
+ * We want to fallback into:
+ * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs)
+ *
+ * do_sys_execve asm fallback arguments:
+ * rdi: name, rsi: argv, rdx: envp, fake frame on the stack
+ */
+ENTRY(execve)
+ CFI_STARTPROC
+ FAKE_STACK_FRAME $0
+ SAVE_ALL
+ call sys_execve
+ movq %rax, RAX(%rsp)
+ RESTORE_REST
+ testq %rax,%rax
+ je int_ret_from_sys_call
+ RESTORE_ARGS
+ UNFAKE_STACK_FRAME
+ ret
+ CFI_ENDPROC
+
+ENTRY(page_fault)
+ errorentry do_page_fault
+
+ENTRY(coprocessor_error)
+ zeroentry do_coprocessor_error
+
+ENTRY(simd_coprocessor_error)
+ zeroentry do_simd_coprocessor_error
+
+ENTRY(device_not_available)
+ zeroentry math_state_restore
+
+ /* runs on exception stack */
+ENTRY(debug)
+ CFI_STARTPROC
+ pushq $0
+ CFI_ADJUST_CFA_OFFSET 8
+ paranoidentry do_debug
+ /* switch back to process stack to restore the state ptrace touched */
+ movq %rax,%rsp
+ testl $3,CS(%rsp)
+ jnz paranoid_userspace
+ jmp paranoid_exit
+ CFI_ENDPROC
+
+ /* runs on exception stack */
+ENTRY(nmi)
+ CFI_STARTPROC
+ pushq $-1
+ CFI_ADJUST_CFA_OFFSET 8
+ paranoidentry do_nmi
+ /* ebx: no swapgs flag */
+paranoid_exit:
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz paranoid_restore
+paranoid_swapgs:
+ cli
+ swapgs
+paranoid_restore:
+ RESTORE_ALL 8
+ iretq
+paranoid_userspace:
+ cli
+ GET_THREAD_INFO(%rcx)
+ movl threadinfo_flags(%rcx),%edx
+ testl $_TIF_NEED_RESCHED,%edx
+ jnz paranoid_resched
+ testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
+ jnz paranoid_signal
+ jmp paranoid_swapgs
+paranoid_resched:
+ sti
+ call schedule
+ jmp paranoid_exit
+paranoid_signal:
+ sti
+ xorl %esi,%esi /* oldset */
+ movq %rsp,%rdi /* &pt_regs */
+ call do_notify_resume
+ jmp paranoid_exit
+ CFI_ENDPROC
+
+ENTRY(int3)
+ zeroentry do_int3
+
+ENTRY(overflow)
+ zeroentry do_overflow
+
+ENTRY(bounds)
+ zeroentry do_bounds
+
+ENTRY(invalid_op)
+ zeroentry do_invalid_op
+
+ENTRY(coprocessor_segment_overrun)
+ zeroentry do_coprocessor_segment_overrun
+
+ENTRY(reserved)
+ zeroentry do_reserved
+
+ /* runs on exception stack */
+ENTRY(double_fault)
+ CFI_STARTPROC
+ paranoidentry do_double_fault
+ movq %rax,%rsp
+ testl $3,CS(%rsp)
+ jnz paranoid_userspace
+ jmp paranoid_exit
+ CFI_ENDPROC
+
+ENTRY(invalid_TSS)
+ errorentry do_invalid_TSS
+
+ENTRY(segment_not_present)
+ errorentry do_segment_not_present
+
+ /* runs on exception stack */
+ENTRY(stack_segment)
+ CFI_STARTPROC
+ paranoidentry do_stack_segment
+ movq %rax,%rsp
+ testl $3,CS(%rsp)
+ jnz paranoid_userspace
+ jmp paranoid_exit
+ CFI_ENDPROC
+
+ENTRY(general_protection)
+ errorentry do_general_protection
+
+ENTRY(alignment_check)
+ errorentry do_alignment_check
+
+ENTRY(divide_error)
+ zeroentry do_divide_error
+
+ENTRY(spurious_interrupt_bug)
+ zeroentry do_spurious_interrupt_bug
+
+#ifdef CONFIG_X86_MCE
+ /* runs on exception stack */
+ENTRY(machine_check)
+ CFI_STARTPROC
+ pushq $0
+ CFI_ADJUST_CFA_OFFSET 8
+ paranoidentry do_machine_check
+ jmp paranoid_exit
+ CFI_ENDPROC
+#endif
+
+ENTRY(call_debug)
+ zeroentry do_call_debug
+
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
new file mode 100644
index 000000000000..d2c42fb99cb3
--- /dev/null
+++ b/arch/x86_64/kernel/genapic.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Generic APIC sub-arch probe layer.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/config.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+#include <asm/smp.h>
+#include <asm/ipi.h>
+
+/* which logical CPU number maps to which CPU (physical APIC ID) */
+u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+EXPORT_SYMBOL(x86_cpu_to_apicid);
+u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+
+extern struct genapic apic_cluster;
+extern struct genapic apic_flat;
+
+struct genapic *genapic = &apic_flat;
+
+
+/*
+ * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
+ */
+void __init clustered_apic_check(void)
+{
+ long i;
+ u8 clusters, max_cluster;
+ u8 id;
+ u8 cluster_cnt[NUM_APIC_CLUSTERS];
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ /* AMD always uses flat mode right now */
+ genapic = &apic_flat;
+ goto print;
+ }
+
+ memset(cluster_cnt, 0, sizeof(cluster_cnt));
+
+ for (i = 0; i < NR_CPUS; i++) {
+ id = bios_cpu_apicid[i];
+ if (id != BAD_APICID)
+ cluster_cnt[APIC_CLUSTERID(id)]++;
+ }
+
+ clusters = 0;
+ max_cluster = 0;
+ for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
+ if (cluster_cnt[i] > 0) {
+ ++clusters;
+ if (cluster_cnt[i] > max_cluster)
+ max_cluster = cluster_cnt[i];
+ }
+ }
+
+ /*
+ * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode,
+ * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical
+ * else physical mode.
+ * (We don't use lowest priority delivery + HW APIC IRQ steering, so
+ * can ignore the clustered logical case and go straight to physical.)
+ */
+ if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster)
+ genapic = &apic_flat;
+ else
+ genapic = &apic_cluster;
+
+print:
+ printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+}
+
+/* Same for both flat and clustered. */
+
+void send_IPI_self(int vector)
+{
+ __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+}
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
new file mode 100644
index 000000000000..9703da7202e3
--- /dev/null
+++ b/arch/x86_64/kernel/genapic_cluster.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Clustered APIC subarch code. Up to 255 CPUs, physical delivery.
+ * (A more realistic maximum is around 230 CPUs.)
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/config.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <asm/smp.h>
+#include <asm/ipi.h>
+
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static void cluster_init_apic_ldr(void)
+{
+ unsigned long val, id;
+ long i, count;
+ u8 lid;
+ u8 my_id = hard_smp_processor_id();
+ u8 my_cluster = APIC_CLUSTER(my_id);
+
+ /* Create logical APIC IDs by counting CPUs already in cluster. */
+ for (count = 0, i = NR_CPUS; --i >= 0; ) {
+ lid = x86_cpu_to_log_apicid[i];
+ if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
+ ++count;
+ }
+ /*
+ * We only have a 4 wide bitmap in cluster mode. There's no way
+ * to get above 60 CPUs and still give each one it's own bit.
+ * But, we're using physical IRQ delivery, so we don't care.
+ * Use bit 3 for the 4th through Nth CPU in each cluster.
+ */
+ if (count >= XAPIC_DEST_CPUS_SHIFT)
+ count = 3;
+ id = my_cluster | (1UL << count);
+ x86_cpu_to_log_apicid[smp_processor_id()] = id;
+ apic_write_around(APIC_DFR, APIC_DFR_CLUSTER);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(id);
+ apic_write_around(APIC_LDR, val);
+}
+
+/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
+
+static cpumask_t cluster_target_cpus(void)
+{
+ return cpumask_of_cpu(0);
+}
+
+static void cluster_send_IPI_mask(cpumask_t mask, int vector)
+{
+ send_IPI_mask_sequence(mask, vector);
+}
+
+static void cluster_send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+ cpu_clear(smp_processor_id(), mask);
+
+ if (!cpus_empty(mask))
+ cluster_send_IPI_mask(mask, vector);
+}
+
+static void cluster_send_IPI_all(int vector)
+{
+ cluster_send_IPI_mask(cpu_online_map, vector);
+}
+
+static int cluster_apic_id_registered(void)
+{
+ return 1;
+}
+
+static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = first_cpu(cpumask);
+ if ((unsigned)cpu < NR_CPUS)
+ return x86_cpu_to_apicid[cpu];
+ else
+ return BAD_APICID;
+}
+
+/* cpuid returns the value latched in the HW at reset, not the APIC ID
+ * register's value. For any box whose BIOS changes APIC IDs, like
+ * clustered APIC systems, we must use hard_smp_processor_id.
+ *
+ * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
+ */
+static unsigned int phys_pkg_id(int index_msb)
+{
+ return hard_smp_processor_id() >> index_msb;
+}
+
+struct genapic apic_cluster = {
+ .name = "clustered",
+ .int_delivery_mode = dest_Fixed,
+ .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
+ .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
+ .target_cpus = cluster_target_cpus,
+ .apic_id_registered = cluster_apic_id_registered,
+ .init_apic_ldr = cluster_init_apic_ldr,
+ .send_IPI_all = cluster_send_IPI_all,
+ .send_IPI_allbutself = cluster_send_IPI_allbutself,
+ .send_IPI_mask = cluster_send_IPI_mask,
+ .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid,
+ .phys_pkg_id = phys_pkg_id,
+};
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
new file mode 100644
index 000000000000..b4cbbad04226
--- /dev/null
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2004 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * Flat APIC subarch code. Maximum 8 CPUs, logical delivery.
+ *
+ * Hacked for x86-64 by James Cleverdon from i386 architecture code by
+ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
+ * James Cleverdon.
+ */
+#include <linux/config.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <asm/smp.h>
+#include <asm/ipi.h>
+
+
+static cpumask_t flat_target_cpus(void)
+{
+ return cpu_online_map;
+}
+
+/*
+ * Set up the logical destination ID.
+ *
+ * Intel recommends to set DFR, LDR and TPR before enabling
+ * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
+ * document number 292116). So here it goes...
+ */
+static void flat_init_apic_ldr(void)
+{
+ unsigned long val;
+ unsigned long num, id;
+
+ num = smp_processor_id();
+ id = 1UL << num;
+ x86_cpu_to_log_apicid[num] = id;
+ apic_write_around(APIC_DFR, APIC_DFR_FLAT);
+ val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
+ val |= SET_APIC_LOGICAL_ID(id);
+ apic_write_around(APIC_LDR, val);
+}
+
+static void flat_send_IPI_allbutself(int vector)
+{
+ /*
+ * if there are no other CPUs in the system then
+ * we get an APIC send error if we try to broadcast.
+ * thus we have to avoid sending IPIs in this case.
+ */
+ if (num_online_cpus() > 1)
+ __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
+}
+
+static void flat_send_IPI_all(int vector)
+{
+ __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
+}
+
+static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
+{
+ unsigned long mask = cpus_addr(cpumask)[0];
+ unsigned long cfg;
+ unsigned long flags;
+
+ local_save_flags(flags);
+ local_irq_disable();
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+
+ /*
+ * prepare target chip field
+ */
+ cfg = __prepare_ICR2(mask);
+ apic_write_around(APIC_ICR2, cfg);
+
+ /*
+ * program the ICR
+ */
+ cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write_around(APIC_ICR, cfg);
+ local_irq_restore(flags);
+}
+
+static int flat_apic_id_registered(void)
+{
+ return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
+}
+
+static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
+}
+
+static unsigned int phys_pkg_id(int index_msb)
+{
+ u32 ebx;
+
+ ebx = cpuid_ebx(1);
+ return ((ebx >> 24) & 0xFF) >> index_msb;
+}
+
+struct genapic apic_flat = {
+ .name = "flat",
+ .int_delivery_mode = dest_LowestPrio,
+ .int_dest_mode = (APIC_DEST_LOGICAL != 0),
+ .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
+ .target_cpus = flat_target_cpus,
+ .apic_id_registered = flat_apic_id_registered,
+ .init_apic_ldr = flat_init_apic_ldr,
+ .send_IPI_all = flat_send_IPI_all,
+ .send_IPI_allbutself = flat_send_IPI_allbutself,
+ .send_IPI_mask = flat_send_IPI_mask,
+ .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
+ .phys_pkg_id = phys_pkg_id,
+};
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
new file mode 100644
index 000000000000..b6d8725c1f61
--- /dev/null
+++ b/arch/x86_64/kernel/head.S
@@ -0,0 +1,396 @@
+/*
+ * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
+ * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
+ *
+ * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
+ */
+
+
+#include <linux/linkage.h>
+#include <linux/threads.h>
+#include <asm/desc.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/cache.h>
+
+/* we are not able to switch in one step to the final KERNEL ADRESS SPACE
+ * because we need identity-mapped pages on setup so define __START_KERNEL to
+ * 0x100000 for this stage
+ *
+ */
+
+ .text
+ .code32
+ .globl startup_32
+/* %bx: 1 if coming from smp trampoline on secondary cpu */
+startup_32:
+
+ /*
+ * At this point the CPU runs in 32bit protected mode (CS.D = 1) with
+ * paging disabled and the point of this file is to switch to 64bit
+ * long mode with a kernel mapping for kerneland to jump into the
+ * kernel virtual addresses.
+ * There is no stack until we set one up.
+ */
+
+ /* Initialize the %ds segment register */
+ movl $__KERNEL_DS,%eax
+ movl %eax,%ds
+
+ /* Load new GDT with the 64bit segments using 32bit descriptor */
+ lgdt pGDT32 - __START_KERNEL_map
+
+ /* If the CPU doesn't support CPUID this will double fault.
+ * Unfortunately it is hard to check for CPUID without a stack.
+ */
+
+ /* Check if extended functions are implemented */
+ movl $0x80000000, %eax
+ cpuid
+ cmpl $0x80000000, %eax
+ jbe no_long_mode
+ /* Check if long mode is implemented */
+ mov $0x80000001, %eax
+ cpuid
+ btl $29, %edx
+ jnc no_long_mode
+
+ /*
+ * Prepare for entering 64bits mode
+ */
+
+ /* Enable PAE mode */
+ xorl %eax, %eax
+ btsl $5, %eax
+ movl %eax, %cr4
+
+ /* Setup early boot stage 4 level pagetables */
+ movl $(init_level4_pgt - __START_KERNEL_map), %eax
+ movl %eax, %cr3
+
+ /* Setup EFER (Extended Feature Enable Register) */
+ movl $MSR_EFER, %ecx
+ rdmsr
+
+ /* Enable Long Mode */
+ btsl $_EFER_LME, %eax
+
+ /* Make changes effective */
+ wrmsr
+
+ xorl %eax, %eax
+ btsl $31, %eax /* Enable paging and in turn activate Long Mode */
+ btsl $0, %eax /* Enable protected mode */
+ /* Make changes effective */
+ movl %eax, %cr0
+ /*
+ * At this point we're in long mode but in 32bit compatibility mode
+ * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
+ * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use
+ * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
+ */
+ ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map)
+
+ .code64
+ .org 0x100
+ .globl startup_64
+startup_64:
+ /* We come here either from startup_32
+ * or directly from a 64bit bootloader.
+ * Since we may have come directly from a bootloader we
+ * reload the page tables here.
+ */
+
+ /* Enable PAE mode and PGE */
+ xorq %rax, %rax
+ btsq $5, %rax
+ btsq $7, %rax
+ movq %rax, %cr4
+
+ /* Setup early boot stage 4 level pagetables. */
+ movq $(init_level4_pgt - __START_KERNEL_map), %rax
+ movq %rax, %cr3
+
+ /* Check if nx is implemented */
+ movl $0x80000001, %eax
+ cpuid
+ movl %edx,%edi
+
+ /* Setup EFER (Extended Feature Enable Register) */
+ movl $MSR_EFER, %ecx
+ rdmsr
+
+ /* Enable System Call */
+ btsl $_EFER_SCE, %eax
+
+ /* No Execute supported? */
+ btl $20,%edi
+ jnc 1f
+ btsl $_EFER_NX, %eax
+1:
+ /* Make changes effective */
+ wrmsr
+
+ /* Setup cr0 */
+ xorq %rax, %rax
+ btsq $31, %rax /* Enable paging */
+ btsq $0, %rax /* Enable protected mode */
+ btsq $1, %rax /* Enable MP */
+ btsq $4, %rax /* Enable ET */
+ btsq $5, %rax /* Enable NE */
+ btsq $16, %rax /* Enable WP */
+ btsq $18, %rax /* Enable AM */
+ /* Make changes effective */
+ movq %rax, %cr0
+
+ /* Setup a boot time stack */
+ movq init_rsp(%rip),%rsp
+
+ /* zero EFLAGS after setting rsp */
+ pushq $0
+ popfq
+
+ /*
+ * We must switch to a new descriptor in kernel space for the GDT
+ * because soon the kernel won't have access anymore to the userspace
+ * addresses where we're currently running on. We have to do that here
+ * because in 32bit we couldn't load a 64bit linear address.
+ */
+ lgdt cpu_gdt_descr
+
+ /*
+ * Setup up a dummy PDA. this is just for some early bootup code
+ * that does in_interrupt()
+ */
+ movl $MSR_GS_BASE,%ecx
+ movq $empty_zero_page,%rax
+ movq %rax,%rdx
+ shrq $32,%rdx
+ wrmsr
+
+ /* set up data segments. actually 0 would do too */
+ movl $__KERNEL_DS,%eax
+ movl %eax,%ds
+ movl %eax,%ss
+ movl %eax,%es
+
+ /* esi is pointer to real mode structure with interesting info.
+ pass it to C */
+ movl %esi, %edi
+
+ /* Finally jump to run C code and to be on real kernel address
+ * Since we are running on identity-mapped space we have to jump
+ * to the full 64bit address , this is only possible as indirect
+ * jump
+ */
+ movq initial_code(%rip),%rax
+ jmp *%rax
+
+ /* SMP bootup changes these two */
+ .globl initial_code
+initial_code:
+ .quad x86_64_start_kernel
+ .globl init_rsp
+init_rsp:
+ .quad init_thread_union+THREAD_SIZE-8
+
+ENTRY(early_idt_handler)
+ xorl %eax,%eax
+ movq 8(%rsp),%rsi # get rip
+ movq (%rsp),%rdx
+ movq %cr2,%rcx
+ leaq early_idt_msg(%rip),%rdi
+ call early_printk
+1: hlt
+ jmp 1b
+
+early_idt_msg:
+ .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
+
+.code32
+ENTRY(no_long_mode)
+ /* This isn't an x86-64 CPU so hang */
+1:
+ jmp 1b
+
+.org 0xf00
+ .globl pGDT32
+pGDT32:
+ .word gdt_end-cpu_gdt_table
+ .long cpu_gdt_table-__START_KERNEL_map
+
+.org 0xf10
+ljumpvector:
+ .long startup_64-__START_KERNEL_map
+ .word __KERNEL_CS
+
+ENTRY(stext)
+ENTRY(_stext)
+
+ /*
+ * This default setting generates an ident mapping at address 0x100000
+ * and a mapping for the kernel that precisely maps virtual address
+ * 0xffffffff80000000 to physical address 0x000000. (always using
+ * 2Mbyte large pages provided by PAE mode)
+ */
+.org 0x1000
+ENTRY(init_level4_pgt)
+ .quad 0x0000000000102007 /* -> level3_ident_pgt */
+ .fill 255,8,0
+ .quad 0x000000000010a007
+ .fill 254,8,0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad 0x0000000000103007 /* -> level3_kernel_pgt */
+
+.org 0x2000
+ENTRY(level3_ident_pgt)
+ .quad 0x0000000000104007
+ .fill 511,8,0
+
+.org 0x3000
+ENTRY(level3_kernel_pgt)
+ .fill 510,8,0
+ /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
+ .quad 0x0000000000105007 /* -> level2_kernel_pgt */
+ .fill 1,8,0
+
+.org 0x4000
+ENTRY(level2_ident_pgt)
+ /* 40MB for bootup. */
+ .quad 0x0000000000000283
+ .quad 0x0000000000200183
+ .quad 0x0000000000400183
+ .quad 0x0000000000600183
+ .quad 0x0000000000800183
+ .quad 0x0000000000A00183
+ .quad 0x0000000000C00183
+ .quad 0x0000000000E00183
+ .quad 0x0000000001000183
+ .quad 0x0000000001200183
+ .quad 0x0000000001400183
+ .quad 0x0000000001600183
+ .quad 0x0000000001800183
+ .quad 0x0000000001A00183
+ .quad 0x0000000001C00183
+ .quad 0x0000000001E00183
+ .quad 0x0000000002000183
+ .quad 0x0000000002200183
+ .quad 0x0000000002400183
+ .quad 0x0000000002600183
+ /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */
+ .globl temp_boot_pmds
+temp_boot_pmds:
+ .fill 492,8,0
+
+.org 0x5000
+ENTRY(level2_kernel_pgt)
+ /* 40MB kernel mapping. The kernel code cannot be bigger than that.
+ When you change this change KERNEL_TEXT_SIZE in page.h too. */
+ /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */
+ .quad 0x0000000000000183
+ .quad 0x0000000000200183
+ .quad 0x0000000000400183
+ .quad 0x0000000000600183
+ .quad 0x0000000000800183
+ .quad 0x0000000000A00183
+ .quad 0x0000000000C00183
+ .quad 0x0000000000E00183
+ .quad 0x0000000001000183
+ .quad 0x0000000001200183
+ .quad 0x0000000001400183
+ .quad 0x0000000001600183
+ .quad 0x0000000001800183
+ .quad 0x0000000001A00183
+ .quad 0x0000000001C00183
+ .quad 0x0000000001E00183
+ .quad 0x0000000002000183
+ .quad 0x0000000002200183
+ .quad 0x0000000002400183
+ .quad 0x0000000002600183
+ /* Module mapping starts here */
+ .fill 492,8,0
+
+.org 0x6000
+ENTRY(empty_zero_page)
+
+.org 0x7000
+ENTRY(empty_bad_page)
+
+.org 0x8000
+ENTRY(empty_bad_pte_table)
+
+.org 0x9000
+ENTRY(empty_bad_pmd_table)
+
+.org 0xa000
+ENTRY(level3_physmem_pgt)
+ .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
+
+ .org 0xb000
+#ifdef CONFIG_ACPI_SLEEP
+ENTRY(wakeup_level4_pgt)
+ .quad 0x0000000000102007 /* -> level3_ident_pgt */
+ .fill 255,8,0
+ .quad 0x000000000010a007
+ .fill 254,8,0
+ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ .quad 0x0000000000103007 /* -> level3_kernel_pgt */
+#endif
+
+ .data
+
+ .align 16
+ .globl cpu_gdt_descr
+cpu_gdt_descr:
+ .word gdt_end-cpu_gdt_table
+gdt:
+ .quad cpu_gdt_table
+#ifdef CONFIG_SMP
+ .rept NR_CPUS-1
+ .word 0
+ .quad 0
+ .endr
+#endif
+
+/* We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ */
+
+.align L1_CACHE_BYTES
+
+/* The TLS descriptors are currently at a different place compared to i386.
+ Hopefully nobody expects them at a fixed place (Wine?) */
+
+ENTRY(cpu_gdt_table)
+ .quad 0x0000000000000000 /* NULL descriptor */
+ .quad 0x008f9a000000ffff /* __KERNEL_COMPAT32_CS */
+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+ .quad 0x00cffa000000ffff /* __USER32_CS */
+ .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */
+ .quad 0x00affa000000ffff /* __USER_CS */
+ .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
+ .quad 0,0 /* TSS */
+ .quad 0,0 /* LDT */
+ .quad 0,0,0 /* three TLS descriptors */
+ .quad 0x00009a000000ffff /* __KERNEL16_CS - 16bit PM for S3 wakeup. */
+ /* base must be patched for real base address. */
+gdt_end:
+ /* asm/segment.h:GDT_ENTRIES must match this */
+ /* This should be a multiple of the cache line size */
+ /* GDTs of other CPUs: */
+ .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table)
+
+ .align L1_CACHE_BYTES
+ENTRY(idt_table)
+ .rept 256
+ .quad 0
+ .quad 0
+ .endr
+
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
new file mode 100644
index 000000000000..6cad46c98a23
--- /dev/null
+++ b/arch/x86_64/kernel/head64.c
@@ -0,0 +1,117 @@
+/*
+ * linux/arch/x86_64/kernel/head64.c -- prepare to run common code
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ *
+ * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $
+ */
+
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/percpu.h>
+
+#include <asm/processor.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/bootsetup.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+
+/* Don't add a printk in there. printk relies on the PDA which is not initialized
+ yet. */
+static void __init clear_bss(void)
+{
+ extern char __bss_start[], __bss_end[];
+ memset(__bss_start, 0,
+ (unsigned long) __bss_end - (unsigned long) __bss_start);
+}
+
+extern char x86_boot_params[2048];
+
+#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
+#define OLD_CL_MAGIC_ADDR 0x90020
+#define OLD_CL_MAGIC 0xA33F
+#define OLD_CL_BASE_ADDR 0x90000
+#define OLD_CL_OFFSET 0x90022
+
+extern char saved_command_line[];
+
+static void __init copy_bootdata(char *real_mode_data)
+{
+ int new_data;
+ char * command_line;
+
+ memcpy(x86_boot_params, real_mode_data, 2048);
+ new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
+ if (!new_data) {
+ if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
+ printk("so old bootloader that it does not support commandline?!\n");
+ return;
+ }
+ new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
+ printk("old bootloader convention, maybe loadlin?\n");
+ }
+ command_line = (char *) ((u64)(new_data));
+ memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
+ printk("Bootdata ok (command line is %s)\n", saved_command_line);
+}
+
+static void __init setup_boot_cpu_data(void)
+{
+ unsigned int dummy, eax;
+
+ /* get vendor info */
+ cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
+ (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
+ (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
+ (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
+
+ /* get cpu type */
+ cpuid(1, &eax, &dummy, &dummy,
+ (unsigned int *) &boot_cpu_data.x86_capability);
+ boot_cpu_data.x86 = (eax >> 8) & 0xf;
+ boot_cpu_data.x86_model = (eax >> 4) & 0xf;
+ boot_cpu_data.x86_mask = eax & 0xf;
+}
+
+extern char _end[];
+
+void __init x86_64_start_kernel(char * real_mode_data)
+{
+ char *s;
+ int i;
+
+ for (i = 0; i < 256; i++)
+ set_intr_gate(i, early_idt_handler);
+ asm volatile("lidt %0" :: "m" (idt_descr));
+ clear_bss();
+ pda_init(0);
+ copy_bootdata(real_mode_data);
+#ifdef CONFIG_SMP
+ cpu_set(0, cpu_online_map);
+#endif
+ /* default console: */
+ if (!strstr(saved_command_line, "console="))
+ strcat(saved_command_line, " console=tty0");
+ s = strstr(saved_command_line, "earlyprintk=");
+ if (s != NULL)
+ setup_early_printk(s);
+#ifdef CONFIG_DISCONTIGMEM
+ s = strstr(saved_command_line, "numa=");
+ if (s != NULL)
+ numa_setup(s+5);
+#endif
+#ifdef CONFIG_X86_IO_APIC
+ if (strstr(saved_command_line, "disableapic"))
+ disable_apic = 1;
+#endif
+ /* You need early console to see that */
+ if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
+ panic("Kernel too big for kernel mapping\n");
+
+ setup_boot_cpu_data();
+ start_kernel();
+}
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
new file mode 100644
index 000000000000..ba139cac57ce
--- /dev/null
+++ b/arch/x86_64/kernel/i387.c
@@ -0,0 +1,155 @@
+/*
+ * linux/arch/x86_64/kernel/i387.c
+ *
+ * Copyright (C) 1994 Linus Torvalds
+ * Copyright (C) 2002 Andi Kleen, SuSE Labs
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ * Gareth Hughes <gareth@valinux.com>, May 2000
+ *
+ * x86-64 rework 2002 Andi Kleen.
+ * Does direct fxsave in and out of user space now for signal handlers.
+ * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation,
+ * the 64bit user space sees a FXSAVE frame directly.
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/sigcontext.h>
+#include <asm/user.h>
+#include <asm/ptrace.h>
+#include <asm/uaccess.h>
+
+unsigned int mxcsr_feature_mask = 0xffffffff;
+
+void mxcsr_feature_mask_init(void)
+{
+ unsigned int mask;
+ clts();
+ memset(&current->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
+ asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
+ mask = current->thread.i387.fxsave.mxcsr_mask;
+ if (mask == 0) mask = 0x0000ffbf;
+ mxcsr_feature_mask &= mask;
+ stts();
+}
+
+/*
+ * Called at bootup to set up the initial FPU state that is later cloned
+ * into all processes.
+ */
+void __init fpu_init(void)
+{
+ unsigned long oldcr0 = read_cr0();
+ extern void __bad_fxsave_alignment(void);
+
+ if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
+ __bad_fxsave_alignment();
+ set_in_cr4(X86_CR4_OSFXSR);
+ set_in_cr4(X86_CR4_OSXMMEXCPT);
+
+ write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
+
+ mxcsr_feature_mask_init();
+ /* clean state in init */
+ current_thread_info()->status = 0;
+ clear_used_math();
+}
+
+void init_fpu(struct task_struct *child)
+{
+ if (tsk_used_math(child)) {
+ if (child == current)
+ unlazy_fpu(child);
+ return;
+ }
+ memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct));
+ child->thread.i387.fxsave.cwd = 0x37f;
+ child->thread.i387.fxsave.mxcsr = 0x1f80;
+ /* only the device not available exception or ptrace can call init_fpu */
+ set_stopped_child_used_math(child);
+}
+
+/*
+ * Signal frame handlers.
+ */
+
+int save_i387(struct _fpstate __user *buf)
+{
+ struct task_struct *tsk = current;
+ int err = 0;
+
+ {
+ extern void bad_user_i387_struct(void);
+ if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave))
+ bad_user_i387_struct();
+ }
+
+ if ((unsigned long)buf % 16)
+ printk("save_i387: bad fpstate %p\n",buf);
+
+ if (!used_math())
+ return 0;
+ clear_used_math(); /* trigger finit */
+ if (tsk->thread_info->status & TS_USEDFPU) {
+ err = save_i387_checking((struct i387_fxsave_struct __user *)buf);
+ if (err) return err;
+ stts();
+ } else {
+ if (__copy_to_user(buf, &tsk->thread.i387.fxsave,
+ sizeof(struct i387_fxsave_struct)))
+ return -1;
+ }
+ return 1;
+}
+
+/*
+ * ptrace request handlers.
+ */
+
+int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk)
+{
+ init_fpu(tsk);
+ return __copy_to_user(buf, &tsk->thread.i387.fxsave,
+ sizeof(struct user_i387_struct)) ? -EFAULT : 0;
+}
+
+int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf)
+{
+ if (__copy_from_user(&tsk->thread.i387.fxsave, buf,
+ sizeof(struct user_i387_struct)))
+ return -EFAULT;
+ return 0;
+}
+
+/*
+ * FPU state for core dumps.
+ */
+
+int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu )
+{
+ struct task_struct *tsk = current;
+
+ if (!used_math())
+ return 0;
+
+ unlazy_fpu(tsk);
+ memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
+ return 1;
+}
+
+int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu)
+{
+ int fpvalid = !!tsk_used_math(tsk);
+
+ if (fpvalid) {
+ if (tsk == current)
+ unlazy_fpu(tsk);
+ memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct));
+}
+ return fpvalid;
+}
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
new file mode 100644
index 000000000000..d52701246010
--- /dev/null
+++ b/arch/x86_64/kernel/i8259.c
@@ -0,0 +1,579 @@
+#include <linux/linkage.h>
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/smp_lock.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+
+#include <asm/acpi.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/delay.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+
+#include <linux/irq.h>
+
+/*
+ * Common place to define all x86 IRQ vectors
+ *
+ * This builds up the IRQ handler stubs using some ugly macros in irq.h
+ *
+ * These macros create the low-level assembly IRQ routines that save
+ * register context and call do_IRQ(). do_IRQ() then does all the
+ * operations that are needed to keep the AT (or SMP IOAPIC)
+ * interrupt-controller happy.
+ */
+
+#define BI(x,y) \
+ BUILD_IRQ(x##y)
+
+#define BUILD_16_IRQS(x) \
+ BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
+ BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
+ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
+ BI(x,c) BI(x,d) BI(x,e) BI(x,f)
+
+#define BUILD_14_IRQS(x) \
+ BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
+ BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
+ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
+ BI(x,c) BI(x,d)
+
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x20-0x2f)
+ */
+BUILD_16_IRQS(0x0)
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+ BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
+BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
+BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
+BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
+
+#ifdef CONFIG_PCI_MSI
+ BUILD_14_IRQS(0xe)
+#endif
+
+#endif
+
+#undef BUILD_16_IRQS
+#undef BUILD_14_IRQS
+#undef BI
+
+
+#define IRQ(x,y) \
+ IRQ##x##y##_interrupt
+
+#define IRQLIST_16(x) \
+ IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
+ IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
+ IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
+ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
+
+#define IRQLIST_14(x) \
+ IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
+ IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
+ IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
+ IRQ(x,c), IRQ(x,d)
+
+void (*interrupt[NR_IRQS])(void) = {
+ IRQLIST_16(0x0),
+
+#ifdef CONFIG_X86_IO_APIC
+ IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3),
+ IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
+ IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
+ IRQLIST_16(0xc), IRQLIST_16(0xd)
+
+#ifdef CONFIG_PCI_MSI
+ , IRQLIST_14(0xe)
+#endif
+
+#endif
+};
+
+#undef IRQ
+#undef IRQLIST_16
+#undef IRQLIST_14
+
+/*
+ * This is the 'legacy' 8259A Programmable Interrupt Controller,
+ * present in the majority of PC/AT boxes.
+ * plus some generic x86 specific things if generic specifics makes
+ * any sense at all.
+ * this file should become arch/i386/kernel/irq.c when the old irq.c
+ * moves to arch independent land
+ */
+
+DEFINE_SPINLOCK(i8259A_lock);
+
+static void end_8259A_irq (unsigned int irq)
+{
+ if (irq > 256) {
+ char var;
+ printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, current->thread_info);
+
+ BUG();
+ }
+
+ if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) &&
+ irq_desc[irq].action)
+ enable_8259A_irq(irq);
+}
+
+#define shutdown_8259A_irq disable_8259A_irq
+
+static void mask_and_ack_8259A(unsigned int);
+
+static unsigned int startup_8259A_irq(unsigned int irq)
+{
+ enable_8259A_irq(irq);
+ return 0; /* never anything pending */
+}
+
+static struct hw_interrupt_type i8259A_irq_type = {
+ "XT-PIC",
+ startup_8259A_irq,
+ shutdown_8259A_irq,
+ enable_8259A_irq,
+ disable_8259A_irq,
+ mask_and_ack_8259A,
+ end_8259A_irq,
+ NULL
+};
+
+/*
+ * 8259A PIC functions to handle ISA devices:
+ */
+
+/*
+ * This contains the irq mask for both 8259A irq controllers,
+ */
+static unsigned int cached_irq_mask = 0xffff;
+
+#define __byte(x,y) (((unsigned char *)&(y))[x])
+#define cached_21 (__byte(0,cached_irq_mask))
+#define cached_A1 (__byte(1,cached_irq_mask))
+
+/*
+ * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
+ * boards the timer interrupt is not really connected to any IO-APIC pin,
+ * it's fed to the master 8259A's IR0 line only.
+ *
+ * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
+ * this 'mixed mode' IRQ handling costs nothing because it's only used
+ * at IRQ setup time.
+ */
+unsigned long io_apic_irqs;
+
+void disable_8259A_irq(unsigned int irq)
+{
+ unsigned int mask = 1 << irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+ cached_irq_mask |= mask;
+ if (irq & 8)
+ outb(cached_A1,0xA1);
+ else
+ outb(cached_21,0x21);
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+void enable_8259A_irq(unsigned int irq)
+{
+ unsigned int mask = ~(1 << irq);
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+ cached_irq_mask &= mask;
+ if (irq & 8)
+ outb(cached_A1,0xA1);
+ else
+ outb(cached_21,0x21);
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+int i8259A_irq_pending(unsigned int irq)
+{
+ unsigned int mask = 1<<irq;
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+ if (irq < 8)
+ ret = inb(0x20) & mask;
+ else
+ ret = inb(0xA0) & (mask >> 8);
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ return ret;
+}
+
+void make_8259A_irq(unsigned int irq)
+{
+ disable_irq_nosync(irq);
+ io_apic_irqs &= ~(1<<irq);
+ irq_desc[irq].handler = &i8259A_irq_type;
+ enable_irq(irq);
+}
+
+/*
+ * This function assumes to be called rarely. Switching between
+ * 8259A registers is slow.
+ * This has to be protected by the irq controller spinlock
+ * before being called.
+ */
+static inline int i8259A_irq_real(unsigned int irq)
+{
+ int value;
+ int irqmask = 1<<irq;
+
+ if (irq < 8) {
+ outb(0x0B,0x20); /* ISR register */
+ value = inb(0x20) & irqmask;
+ outb(0x0A,0x20); /* back to the IRR register */
+ return value;
+ }
+ outb(0x0B,0xA0); /* ISR register */
+ value = inb(0xA0) & (irqmask >> 8);
+ outb(0x0A,0xA0); /* back to the IRR register */
+ return value;
+}
+
+/*
+ * Careful! The 8259A is a fragile beast, it pretty
+ * much _has_ to be done exactly like this (mask it
+ * first, _then_ send the EOI, and the order of EOI
+ * to the two 8259s is important!
+ */
+static void mask_and_ack_8259A(unsigned int irq)
+{
+ unsigned int irqmask = 1 << irq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+ /*
+ * Lightweight spurious IRQ detection. We do not want
+ * to overdo spurious IRQ handling - it's usually a sign
+ * of hardware problems, so we only do the checks we can
+ * do without slowing down good hardware unnecesserily.
+ *
+ * Note that IRQ7 and IRQ15 (the two spurious IRQs
+ * usually resulting from the 8259A-1|2 PICs) occur
+ * even if the IRQ is masked in the 8259A. Thus we
+ * can check spurious 8259A IRQs without doing the
+ * quite slow i8259A_irq_real() call for every IRQ.
+ * This does not cover 100% of spurious interrupts,
+ * but should be enough to warn the user that there
+ * is something bad going on ...
+ */
+ if (cached_irq_mask & irqmask)
+ goto spurious_8259A_irq;
+ cached_irq_mask |= irqmask;
+
+handle_real_irq:
+ if (irq & 8) {
+ inb(0xA1); /* DUMMY - (do we need this?) */
+ outb(cached_A1,0xA1);
+ outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */
+ outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */
+ } else {
+ inb(0x21); /* DUMMY - (do we need this?) */
+ outb(cached_21,0x21);
+ outb(0x60+irq,0x20); /* 'Specific EOI' to master */
+ }
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+ return;
+
+spurious_8259A_irq:
+ /*
+ * this is the slow path - should happen rarely.
+ */
+ if (i8259A_irq_real(irq))
+ /*
+ * oops, the IRQ _is_ in service according to the
+ * 8259A - not spurious, go handle it.
+ */
+ goto handle_real_irq;
+
+ {
+ static int spurious_irq_mask;
+ /*
+ * At this point we can be sure the IRQ is spurious,
+ * lets ACK and report it. [once per IRQ]
+ */
+ if (!(spurious_irq_mask & irqmask)) {
+ printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq);
+ spurious_irq_mask |= irqmask;
+ }
+ atomic_inc(&irq_err_count);
+ /*
+ * Theoretically we do not have to handle this IRQ,
+ * but in Linux this does not cause problems and is
+ * simpler for us.
+ */
+ goto handle_real_irq;
+ }
+}
+
+void init_8259A(int auto_eoi)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+
+ outb(0xff, 0x21); /* mask all of 8259A-1 */
+ outb(0xff, 0xA1); /* mask all of 8259A-2 */
+
+ /*
+ * outb_p - this has to work on a wide range of PC hardware.
+ */
+ outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */
+ outb_p(0x20 + 0, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */
+ outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */
+ if (auto_eoi)
+ outb_p(0x03, 0x21); /* master does Auto EOI */
+ else
+ outb_p(0x01, 0x21); /* master expects normal EOI */
+
+ outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */
+ outb_p(0x20 + 8, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */
+ outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */
+ outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode
+ is to be investigated) */
+
+ if (auto_eoi)
+ /*
+ * in AEOI mode we just have to mask the interrupt
+ * when acking.
+ */
+ i8259A_irq_type.ack = disable_8259A_irq;
+ else
+ i8259A_irq_type.ack = mask_and_ack_8259A;
+
+ udelay(100); /* wait for 8259A to initialize */
+
+ outb(cached_21, 0x21); /* restore master IRQ mask */
+ outb(cached_A1, 0xA1); /* restore slave IRQ mask */
+
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+static char irq_trigger[2];
+/**
+ * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
+ */
+static void restore_ELCR(char *trigger)
+{
+ outb(trigger[0], 0x4d0);
+ outb(trigger[1], 0x4d1);
+}
+
+static void save_ELCR(char *trigger)
+{
+ /* IRQ 0,1,2,8,13 are marked as reserved */
+ trigger[0] = inb(0x4d0) & 0xF8;
+ trigger[1] = inb(0x4d1) & 0xDE;
+}
+
+static int i8259A_resume(struct sys_device *dev)
+{
+ init_8259A(0);
+ restore_ELCR(irq_trigger);
+ return 0;
+}
+
+static int i8259A_suspend(struct sys_device *dev, u32 state)
+{
+ save_ELCR(irq_trigger);
+ return 0;
+}
+
+static struct sysdev_class i8259_sysdev_class = {
+ set_kset_name("i8259"),
+ .suspend = i8259A_suspend,
+ .resume = i8259A_resume,
+};
+
+static struct sys_device device_i8259A = {
+ .id = 0,
+ .cls = &i8259_sysdev_class,
+};
+
+static int __init i8259A_init_sysfs(void)
+{
+ int error = sysdev_class_register(&i8259_sysdev_class);
+ if (!error)
+ error = sysdev_register(&device_i8259A);
+ return error;
+}
+
+device_initcall(i8259A_init_sysfs);
+
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+
+static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
+
+void __init init_ISA_irqs (void)
+{
+ int i;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ init_bsp_APIC();
+#endif
+ init_8259A(0);
+
+ for (i = 0; i < NR_IRQS; i++) {
+ irq_desc[i].status = IRQ_DISABLED;
+ irq_desc[i].action = NULL;
+ irq_desc[i].depth = 1;
+
+ if (i < 16) {
+ /*
+ * 16 old-style INTA-cycle interrupts:
+ */
+ irq_desc[i].handler = &i8259A_irq_type;
+ } else {
+ /*
+ * 'high' PCI IRQs filled in on demand
+ */
+ irq_desc[i].handler = &no_irq_type;
+ }
+ }
+}
+
+void apic_timer_interrupt(void);
+void spurious_interrupt(void);
+void error_interrupt(void);
+void reschedule_interrupt(void);
+void call_function_interrupt(void);
+void invalidate_interrupt(void);
+void thermal_interrupt(void);
+void i8254_timer_resume(void);
+
+static void setup_timer(void)
+{
+ outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */
+ udelay(10);
+ outb_p(LATCH & 0xff , 0x40); /* LSB */
+ udelay(10);
+ outb(LATCH >> 8 , 0x40); /* MSB */
+}
+
+static int timer_resume(struct sys_device *dev)
+{
+ setup_timer();
+ return 0;
+}
+
+void i8254_timer_resume(void)
+{
+ setup_timer();
+}
+
+static struct sysdev_class timer_sysclass = {
+ set_kset_name("timer"),
+ .resume = timer_resume,
+};
+
+static struct sys_device device_timer = {
+ .id = 0,
+ .cls = &timer_sysclass,
+};
+
+static int __init init_timer_sysfs(void)
+{
+ int error = sysdev_class_register(&timer_sysclass);
+ if (!error)
+ error = sysdev_register(&device_timer);
+ return error;
+}
+
+device_initcall(init_timer_sysfs);
+
+void __init init_IRQ(void)
+{
+ int i;
+
+ init_ISA_irqs();
+ /*
+ * Cover the whole vector space, no vector can escape
+ * us. (some of these will be overridden and become
+ * 'special' SMP interrupts)
+ */
+ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+ int vector = FIRST_EXTERNAL_VECTOR + i;
+ if (i >= NR_IRQS)
+ break;
+ if (vector != IA32_SYSCALL_VECTOR && vector != KDB_VECTOR) {
+ set_intr_gate(vector, interrupt[i]);
+ }
+ }
+
+#ifdef CONFIG_SMP
+ /*
+ * IRQ0 must be given a fixed assignment and initialized,
+ * because it's used before the IO-APIC is set up.
+ */
+ set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+
+ /*
+ * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+ * IPI, driven by wakeup.
+ */
+ set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+ /* IPI for invalidation */
+ set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+
+ /* IPI for generic function call */
+ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+#endif
+ set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ /* self generated IPI for local APIC timer */
+ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+ /* IPI vectors for APIC spurious and error interrupts */
+ set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+ set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+#endif
+
+ /*
+ * Set the clock to HZ Hz, we already have a valid
+ * vector now:
+ */
+ setup_timer();
+
+ if (!acpi_ioapic)
+ setup_irq(2, &irq2);
+}
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
new file mode 100644
index 000000000000..c4dc91491157
--- /dev/null
+++ b/arch/x86_64/kernel/init_task.c
@@ -0,0 +1,49 @@
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/fs.h>
+#include <linux/mqueue.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+
+static struct fs_struct init_fs = INIT_FS;
+static struct files_struct init_files = INIT_FILES;
+static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
+struct mm_struct init_mm = INIT_MM(init_mm);
+
+EXPORT_SYMBOL(init_mm);
+
+/*
+ * Initial task structure.
+ *
+ * We need to make sure that this is 8192-byte aligned due to the
+ * way process stacks are handled. This is done by having a special
+ * "init_task" linker map entry..
+ */
+union thread_union init_thread_union
+ __attribute__((__section__(".data.init_task"))) =
+ { INIT_THREAD_INFO(init_task) };
+
+/*
+ * Initial task structure.
+ *
+ * All other task structs will be allocated on slabs in fork.c
+ */
+struct task_struct init_task = INIT_TASK(init_task);
+
+EXPORT_SYMBOL(init_task);
+/*
+ * per-CPU TSS segments. Threads are completely 'soft' on Linux,
+ * no more per-task TSS's. The TSS size is kept cacheline-aligned
+ * so they are allowed to end up in the .data.cacheline_aligned
+ * section. Since TSS's are completely CPU-local, we want them
+ * on exact cacheline boundaries, to eliminate cacheline ping-pong.
+ */
+DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp;
+
+#define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
new file mode 100644
index 000000000000..adebe70c180e
--- /dev/null
+++ b/arch/x86_64/kernel/io_apic.c
@@ -0,0 +1,1982 @@
+/*
+ * Intel IO-APIC support for multi-Pentium hosts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
+ *
+ * Many thanks to Stig Venaas for trying out countless experimental
+ * patches and reporting/debugging problems patiently!
+ *
+ * (c) 1999, Multiple IO-APIC support, developed by
+ * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
+ * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
+ * further tested and cleaned up by Zach Brown <zab@redhat.com>
+ * and Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes
+ * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
+ * thanks to Eric Gilmore
+ * and Rolf G. Tews
+ * for testing these extensively
+ * Paul Diefenbaugh : Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/config.h>
+#include <linux/smp_lock.h>
+#include <linux/mc146818rtc.h>
+#include <linux/acpi.h>
+#include <linux/sysdev.h>
+
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+#include <asm/mach_apic.h>
+
+#define __apicdebuginit __init
+
+int sis_apic_bug; /* not actually supported, dummy for compile */
+
+static DEFINE_SPINLOCK(ioapic_lock);
+
+/*
+ * # of IRQ routing registers
+ */
+int nr_ioapic_registers[MAX_IO_APICS];
+
+/*
+ * Rough estimation of how many shared IRQs there are, can
+ * be changed anytime.
+ */
+#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+static struct irq_pin_list {
+ short apic, pin, next;
+} irq_2_pin[PIN_MAP_SIZE];
+
+int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
+#ifdef CONFIG_PCI_MSI
+#define vector_to_irq(vector) \
+ (platform_legacy_irq(vector) ? vector : vector_irq[vector])
+#else
+#define vector_to_irq(vector) (vector)
+#endif
+
+/*
+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
+ * shared ISA-space IRQs, so we have to support them. We are super
+ * fast in the common case, and fast for shared ISA-space IRQs.
+ */
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+ static int first_free_entry = NR_IRQS;
+ struct irq_pin_list *entry = irq_2_pin + irq;
+
+ while (entry->next)
+ entry = irq_2_pin + entry->next;
+
+ if (entry->pin != -1) {
+ entry->next = first_free_entry;
+ entry = irq_2_pin + entry->next;
+ if (++first_free_entry >= PIN_MAP_SIZE)
+ panic("io_apic.c: whoops");
+ }
+ entry->apic = apic;
+ entry->pin = pin;
+}
+
+#define __DO_ACTION(R, ACTION, FINAL) \
+ \
+{ \
+ int pin; \
+ struct irq_pin_list *entry = irq_2_pin + irq; \
+ \
+ for (;;) { \
+ unsigned int reg; \
+ pin = entry->pin; \
+ if (pin == -1) \
+ break; \
+ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
+ reg ACTION; \
+ io_apic_modify(entry->apic, reg); \
+ if (!entry->next) \
+ break; \
+ entry = irq_2_pin + entry->next; \
+ } \
+ FINAL; \
+}
+
+#define DO_ACTION(name,R,ACTION, FINAL) \
+ \
+ static void name##_IO_APIC_irq (unsigned int irq) \
+ __DO_ACTION(R, ACTION, FINAL)
+
+DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
+ /* mask = 1 */
+DO_ACTION( __unmask, 0, &= 0xfffeffff, )
+ /* mask = 0 */
+
+static void mask_IO_APIC_irq (unsigned int irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __mask_IO_APIC_irq(irq);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void unmask_IO_APIC_irq (unsigned int irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __unmask_IO_APIC_irq(irq);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+{
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ /* Check delivery_mode to be sure we're not clearing an SMI pin */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
+ *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (entry.delivery_mode == dest_SMI)
+ return;
+ /*
+ * Disable it in the IO-APIC irq-routing table:
+ */
+ memset(&entry, 0, sizeof(entry));
+ entry.mask = 1;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
+ io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC (void)
+{
+ int apic, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+ clear_IO_APIC_pin(apic, pin);
+}
+
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries [MAX_PIRQS];
+static int pirqs_enabled;
+int skip_ioapic_setup;
+int ioapic_force;
+
+/* dummy parsing: see setup.c */
+
+static int __init disable_ioapic_setup(char *str)
+{
+ skip_ioapic_setup = 1;
+ return 1;
+}
+
+static int __init enable_ioapic_setup(char *str)
+{
+ ioapic_force = 1;
+ skip_ioapic_setup = 0;
+ return 1;
+}
+
+__setup("noapic", disable_ioapic_setup);
+__setup("apic", enable_ioapic_setup);
+
+#include <asm/pci-direct.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+
+/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
+ off. Check for an Nvidia or VIA PCI bridge and turn it off.
+ Use pci direct infrastructure because this runs before the PCI subsystem.
+
+ Can be overwritten with "apic"
+
+ And another hack to disable the IOMMU on VIA chipsets.
+
+ Kludge-O-Rama. */
+void __init check_ioapic(void)
+{
+ int num,slot,func;
+ if (ioapic_force)
+ return;
+
+ /* Poor man's PCI discovery */
+ for (num = 0; num < 32; num++) {
+ for (slot = 0; slot < 32; slot++) {
+ for (func = 0; func < 8; func++) {
+ u32 class;
+ u32 vendor;
+ u8 type;
+ class = read_pci_config(num,slot,func,
+ PCI_CLASS_REVISION);
+ if (class == 0xffffffff)
+ break;
+
+ if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
+ continue;
+
+ vendor = read_pci_config(num, slot, func,
+ PCI_VENDOR_ID);
+ vendor &= 0xffff;
+ switch (vendor) {
+ case PCI_VENDOR_ID_VIA:
+#ifdef CONFIG_GART_IOMMU
+ if ((end_pfn >= (0xffffffff>>PAGE_SHIFT) ||
+ force_iommu) &&
+ !iommu_aperture_allowed) {
+ printk(KERN_INFO
+ "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n");
+ iommu_aperture_disabled = 1;
+ }
+#endif
+ return;
+ case PCI_VENDOR_ID_NVIDIA:
+#ifdef CONFIG_ACPI
+ /* All timer overrides on Nvidia
+ seem to be wrong. Skip them. */
+ acpi_skip_timer_override = 1;
+ printk(KERN_INFO
+ "Nvidia board detected. Ignoring ACPI timer override.\n");
+#endif
+ /* RED-PEN skip them on mptables too? */
+ return;
+ }
+
+ /* No multi-function device? */
+ type = read_pci_config_byte(num,slot,func,
+ PCI_HEADER_TYPE);
+ if (!(type & 0x80))
+ break;
+ }
+ }
+ }
+}
+
+static int __init ioapic_pirq_setup(char *str)
+{
+ int i, max;
+ int ints[MAX_PIRQS+1];
+
+ get_options(str, ARRAY_SIZE(ints), ints);
+
+ for (i = 0; i < MAX_PIRQS; i++)
+ pirq_entries[i] = -1;
+
+ pirqs_enabled = 1;
+ apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
+ max = MAX_PIRQS;
+ if (ints[0] < MAX_PIRQS)
+ max = ints[0];
+
+ for (i = 0; i < max; i++) {
+ apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+ /*
+ * PIRQs are mapped upside down, usually.
+ */
+ pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+ }
+ return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+
+/*
+ * Find the IRQ entry number of a certain pin.
+ */
+static int find_irq_entry(int apic, int pin, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].mpc_irqtype == type &&
+ (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
+ mp_irqs[i].mpc_dstirq == pin)
+ return i;
+
+ return -1;
+}
+
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected
+ */
+static int __init find_isa_irq_pin(int irq, int type)
+{
+ int i;
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].mpc_srcbus;
+
+ if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
+ mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
+ mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
+ (mp_irqs[i].mpc_irqtype == type) &&
+ (mp_irqs[i].mpc_srcbusirq == irq))
+
+ return mp_irqs[i].mpc_dstirq;
+ }
+ return -1;
+}
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+static int pin_2_irq(int idx, int apic, int pin);
+
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
+{
+ int apic, i, best_guess = -1;
+
+ apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+ bus, slot, pin);
+ if (mp_bus_id_to_pci_bus[bus] == -1) {
+ apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+ return -1;
+ }
+ for (i = 0; i < mp_irq_entries; i++) {
+ int lbus = mp_irqs[i].mpc_srcbus;
+
+ for (apic = 0; apic < nr_ioapics; apic++)
+ if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
+ mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+ break;
+
+ if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
+ !mp_irqs[i].mpc_irqtype &&
+ (bus == lbus) &&
+ (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
+ int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+
+ if (!(apic || IO_APIC_IRQ(irq)))
+ continue;
+
+ if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+ return irq;
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_guess < 0)
+ best_guess = irq;
+ }
+ }
+ return best_guess;
+}
+
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+ if (irq < 16) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
+ return 0;
+}
+
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_polarity(idx) (0)
+
+/* ISA interrupts are always polarity zero edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx) (0)
+#define default_ISA_polarity(idx) (0)
+
+/* PCI interrupts are always polarity one level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_PCI_trigger(idx) (1)
+#define default_PCI_polarity(idx) (1)
+
+/* MCA interrupts are always polarity zero level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_MCA_trigger(idx) (1)
+#define default_MCA_polarity(idx) (0)
+
+static int __init MPBIOS_polarity(int idx)
+{
+ int bus = mp_irqs[idx].mpc_srcbus;
+ int polarity;
+
+ /*
+ * Determine IRQ line polarity (high active or low active):
+ */
+ switch (mp_irqs[idx].mpc_irqflag & 3)
+ {
+ case 0: /* conforms, ie. bus-type dependent polarity */
+ {
+ switch (mp_bus_id_to_type[bus])
+ {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ polarity = default_ISA_polarity(idx);
+ break;
+ }
+ case MP_BUS_EISA: /* EISA pin */
+ {
+ polarity = default_EISA_polarity(idx);
+ break;
+ }
+ case MP_BUS_PCI: /* PCI pin */
+ {
+ polarity = default_PCI_polarity(idx);
+ break;
+ }
+ case MP_BUS_MCA: /* MCA pin */
+ {
+ polarity = default_MCA_polarity(idx);
+ break;
+ }
+ default:
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ }
+ break;
+ }
+ case 1: /* high active */
+ {
+ polarity = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ case 3: /* low active */
+ {
+ polarity = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ polarity = 1;
+ break;
+ }
+ }
+ return polarity;
+}
+
+static int MPBIOS_trigger(int idx)
+{
+ int bus = mp_irqs[idx].mpc_srcbus;
+ int trigger;
+
+ /*
+ * Determine IRQ trigger mode (edge or level sensitive):
+ */
+ switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
+ {
+ case 0: /* conforms, ie. bus-type dependent */
+ {
+ switch (mp_bus_id_to_type[bus])
+ {
+ case MP_BUS_ISA: /* ISA pin */
+ {
+ trigger = default_ISA_trigger(idx);
+ break;
+ }
+ case MP_BUS_EISA: /* EISA pin */
+ {
+ trigger = default_EISA_trigger(idx);
+ break;
+ }
+ case MP_BUS_PCI: /* PCI pin */
+ {
+ trigger = default_PCI_trigger(idx);
+ break;
+ }
+ case MP_BUS_MCA: /* MCA pin */
+ {
+ trigger = default_MCA_trigger(idx);
+ break;
+ }
+ default:
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ }
+ break;
+ }
+ case 1: /* edge */
+ {
+ trigger = 0;
+ break;
+ }
+ case 2: /* reserved */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 1;
+ break;
+ }
+ case 3: /* level */
+ {
+ trigger = 1;
+ break;
+ }
+ default: /* invalid */
+ {
+ printk(KERN_WARNING "broken BIOS!!\n");
+ trigger = 0;
+ break;
+ }
+ }
+ return trigger;
+}
+
+static inline int irq_polarity(int idx)
+{
+ return MPBIOS_polarity(idx);
+}
+
+static inline int irq_trigger(int idx)
+{
+ return MPBIOS_trigger(idx);
+}
+
+static int pin_2_irq(int idx, int apic, int pin)
+{
+ int irq, i;
+ int bus = mp_irqs[idx].mpc_srcbus;
+
+ /*
+ * Debugging check, we are in big trouble if this message pops up!
+ */
+ if (mp_irqs[idx].mpc_dstirq != pin)
+ printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+
+ switch (mp_bus_id_to_type[bus])
+ {
+ case MP_BUS_ISA: /* ISA pin */
+ case MP_BUS_EISA:
+ case MP_BUS_MCA:
+ {
+ irq = mp_irqs[idx].mpc_srcbusirq;
+ break;
+ }
+ case MP_BUS_PCI: /* PCI pin */
+ {
+ /*
+ * PCI IRQs are mapped in order
+ */
+ i = irq = 0;
+ while (i < apic)
+ irq += nr_ioapic_registers[i++];
+ irq += pin;
+ break;
+ }
+ default:
+ {
+ printk(KERN_ERR "unknown bus type %d.\n",bus);
+ irq = 0;
+ break;
+ }
+ }
+
+ /*
+ * PCI IRQ command line redirection. Yes, limits are hardcoded.
+ */
+ if ((pin >= 16) && (pin <= 23)) {
+ if (pirq_entries[pin-16] != -1) {
+ if (!pirq_entries[pin-16]) {
+ apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
+ } else {
+ irq = pirq_entries[pin-16];
+ apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
+ pin-16, irq);
+ }
+ }
+ }
+ return irq;
+}
+
+static inline int IO_APIC_irq_trigger(int irq)
+{
+ int apic, idx, pin;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+ idx = find_irq_entry(apic,pin,mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
+ return irq_trigger(idx);
+ }
+ }
+ /*
+ * nonexistent IRQs are edge default
+ */
+ return 0;
+}
+
+/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
+u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
+
+int assign_irq_vector(int irq)
+{
+ static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
+
+ BUG_ON(irq >= NR_IRQ_VECTORS);
+ if (IO_APIC_VECTOR(irq) > 0)
+ return IO_APIC_VECTOR(irq);
+next:
+ current_vector += 8;
+ if (current_vector == IA32_SYSCALL_VECTOR)
+ goto next;
+
+ if (current_vector >= FIRST_SYSTEM_VECTOR) {
+ offset++;
+ if (!(offset%8))
+ return -ENOSPC;
+ current_vector = FIRST_DEVICE_VECTOR + offset;
+ }
+
+ vector_irq[current_vector] = irq;
+ if (irq != AUTO_ASSIGN)
+ IO_APIC_VECTOR(irq) = current_vector;
+
+ return current_vector;
+}
+
+extern void (*interrupt[NR_IRQS])(void);
+static struct hw_interrupt_type ioapic_level_type;
+static struct hw_interrupt_type ioapic_edge_type;
+
+#define IOAPIC_AUTO -1
+#define IOAPIC_EDGE 0
+#define IOAPIC_LEVEL 1
+
+static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+{
+ if (use_pci_vector() && !platform_legacy_irq(irq)) {
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL)
+ irq_desc[vector].handler = &ioapic_level_type;
+ else
+ irq_desc[vector].handler = &ioapic_edge_type;
+ set_intr_gate(vector, interrupt[vector]);
+ } else {
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL)
+ irq_desc[irq].handler = &ioapic_level_type;
+ else
+ irq_desc[irq].handler = &ioapic_edge_type;
+ set_intr_gate(vector, interrupt[irq]);
+ }
+}
+
+static void __init setup_IO_APIC_irqs(void)
+{
+ struct IO_APIC_route_entry entry;
+ int apic, pin, idx, irq, first_notcon = 1, vector;
+ unsigned long flags;
+
+ apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+
+ /*
+ * add it to the IO-APIC irq-routing table:
+ */
+ memset(&entry,0,sizeof(entry));
+
+ entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = INT_DEST_MODE;
+ entry.mask = 0; /* enable IRQ */
+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+
+ idx = find_irq_entry(apic,pin,mp_INT);
+ if (idx == -1) {
+ if (first_notcon) {
+ apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ first_notcon = 0;
+ } else
+ apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin);
+ continue;
+ }
+
+ entry.trigger = irq_trigger(idx);
+ entry.polarity = irq_polarity(idx);
+
+ if (irq_trigger(idx)) {
+ entry.trigger = 1;
+ entry.mask = 1;
+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ }
+
+ irq = pin_2_irq(idx, apic, pin);
+ add_pin_to_irq(irq, apic, pin);
+
+ if (!apic && !IO_APIC_IRQ(irq))
+ continue;
+
+ if (IO_APIC_IRQ(irq)) {
+ vector = assign_irq_vector(irq);
+ entry.vector = vector;
+
+ ioapic_register_intr(irq, vector, IOAPIC_AUTO);
+ if (!apic && (irq < 16))
+ disable_8259A_irq(irq);
+ }
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
+ io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+ }
+
+ if (!first_notcon)
+ apic_printk(APIC_VERBOSE," not connected.\n");
+}
+
+/*
+ * Set up the 8259A-master output pin as broadcast to all
+ * CPUs.
+ */
+static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
+{
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ memset(&entry,0,sizeof(entry));
+
+ disable_8259A_irq(0);
+
+ /* mask LVT0 */
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+
+ /*
+ * We use logical delivery to get the timer IRQ
+ * to the first CPU.
+ */
+ entry.dest_mode = INT_DEST_MODE;
+ entry.mask = 0; /* unmask IRQ now */
+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.polarity = 0;
+ entry.trigger = 0;
+ entry.vector = vector;
+
+ /*
+ * The timer IRQ doesn't have to know that behind the
+ * scene we have a 8259A-master in AEOI mode ...
+ */
+ irq_desc[0].handler = &ioapic_edge_type;
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+ io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ enable_8259A_irq(0);
+}
+
+void __init UNEXPECTED_IO_APIC(void)
+{
+}
+
+void __apicdebuginit print_IO_APIC(void)
+{
+ int apic, i;
+ union IO_APIC_reg_00 reg_00;
+ union IO_APIC_reg_01 reg_01;
+ union IO_APIC_reg_02 reg_02;
+ unsigned long flags;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+ for (i = 0; i < nr_ioapics; i++)
+ printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+ mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+
+ /*
+ * We are a bit conservative about what we expect. We have to
+ * know about every hardware change ASAP.
+ */
+ printk(KERN_INFO "testing the IO APIC.......................\n");
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ reg_01.raw = io_apic_read(apic, 1);
+ if (reg_01.bits.version >= 0x10)
+ reg_02.raw = io_apic_read(apic, 2);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ printk("\n");
+ printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+ printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+ printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
+ if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
+ UNEXPECTED_IO_APIC();
+
+ printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
+ printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
+ if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
+ (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
+ (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
+ (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
+ (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
+ (reg_01.bits.entries != 0x2E) &&
+ (reg_01.bits.entries != 0x3F) &&
+ (reg_01.bits.entries != 0x03)
+ )
+ UNEXPECTED_IO_APIC();
+
+ printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
+ printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
+ if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
+ (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */
+ (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
+ (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
+ (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
+ (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */
+ )
+ UNEXPECTED_IO_APIC();
+ if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
+ UNEXPECTED_IO_APIC();
+
+ if (reg_01.bits.version >= 0x10) {
+ printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+ printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
+ if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
+ UNEXPECTED_IO_APIC();
+ }
+
+ printk(KERN_DEBUG ".... IRQ redirection table:\n");
+
+ printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
+ " Stat Dest Deli Vect: \n");
+
+ for (i = 0; i <= reg_01.bits.entries; i++) {
+ struct IO_APIC_route_entry entry;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
+ *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ printk(KERN_DEBUG " %02x %03X %02X ",
+ i,
+ entry.dest.logical.logical_dest,
+ entry.dest.physical.physical_dest
+ );
+
+ printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
+ entry.mask,
+ entry.trigger,
+ entry.irr,
+ entry.polarity,
+ entry.delivery_status,
+ entry.dest_mode,
+ entry.delivery_mode,
+ entry.vector
+ );
+ }
+ }
+ if (use_pci_vector())
+ printk(KERN_INFO "Using vector-based indexing\n");
+ printk(KERN_DEBUG "IRQ to pin mappings:\n");
+ for (i = 0; i < NR_IRQS; i++) {
+ struct irq_pin_list *entry = irq_2_pin + i;
+ if (entry->pin < 0)
+ continue;
+ if (use_pci_vector() && !platform_legacy_irq(i))
+ printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
+ else
+ printk(KERN_DEBUG "IRQ%d ", i);
+ for (;;) {
+ printk("-> %d:%d", entry->apic, entry->pin);
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+ printk("\n");
+ }
+
+ printk(KERN_INFO ".................................... done.\n");
+
+ return;
+}
+
+#if 0
+
+static __apicdebuginit void print_APIC_bitfield (int base)
+{
+ unsigned int v;
+ int i, j;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
+ for (i = 0; i < 8; i++) {
+ v = apic_read(base + i*0x10);
+ for (j = 0; j < 32; j++) {
+ if (v & (1<<j))
+ printk("1");
+ else
+ printk("0");
+ }
+ printk("\n");
+ }
+}
+
+void __apicdebuginit print_local_APIC(void * dummy)
+{
+ unsigned int v, ver, maxlvt;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+ smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
+ v = apic_read(APIC_LVR);
+ printk(KERN_INFO "... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ v = apic_read(APIC_ARBPRI);
+ printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+ v & APIC_ARBPRI_MASK);
+ v = apic_read(APIC_PROCPRI);
+ printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+ }
+
+ v = apic_read(APIC_EOI);
+ printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
+ v = apic_read(APIC_RRR);
+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+ v = apic_read(APIC_LDR);
+ printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
+ v = apic_read(APIC_DFR);
+ printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+ v = apic_read(APIC_SPIV);
+ printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
+
+ printk(KERN_DEBUG "... APIC ISR field:\n");
+ print_APIC_bitfield(APIC_ISR);
+ printk(KERN_DEBUG "... APIC TMR field:\n");
+ print_APIC_bitfield(APIC_TMR);
+ printk(KERN_DEBUG "... APIC IRR field:\n");
+ print_APIC_bitfield(APIC_IRR);
+
+ if (APIC_INTEGRATED(ver)) { /* !82489DX */
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ v = apic_read(APIC_ESR);
+ printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_ICR);
+ printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
+ v = apic_read(APIC_ICR2);
+ printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
+
+ v = apic_read(APIC_LVTT);
+ printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
+
+ if (maxlvt > 3) { /* PC is LVT#4. */
+ v = apic_read(APIC_LVTPC);
+ printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
+ }
+ v = apic_read(APIC_LVT0);
+ printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
+ v = apic_read(APIC_LVT1);
+ printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
+
+ if (maxlvt > 2) { /* ERR is LVT#3. */
+ v = apic_read(APIC_LVTERR);
+ printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_TMICT);
+ printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
+ v = apic_read(APIC_TMCCT);
+ printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
+ v = apic_read(APIC_TDCR);
+ printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+ printk("\n");
+}
+
+void print_all_local_APICs (void)
+{
+ on_each_cpu(print_local_APIC, NULL, 1, 1);
+}
+
+void __apicdebuginit print_PIC(void)
+{
+ extern spinlock_t i8259A_lock;
+ unsigned int v;
+ unsigned long flags;
+
+ if (apic_verbosity == APIC_QUIET)
+ return;
+
+ printk(KERN_DEBUG "\nprinting PIC contents\n");
+
+ spin_lock_irqsave(&i8259A_lock, flags);
+
+ v = inb(0xa1) << 8 | inb(0x21);
+ printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
+
+ v = inb(0xa0) << 8 | inb(0x20);
+ printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
+
+ outb(0x0b,0xa0);
+ outb(0x0b,0x20);
+ v = inb(0xa0) << 8 | inb(0x20);
+ outb(0x0a,0xa0);
+ outb(0x0a,0x20);
+
+ spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
+
+ v = inb(0x4d1) << 8 | inb(0x4d0);
+ printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
+}
+
+#endif /* 0 */
+
+static void __init enable_IO_APIC(void)
+{
+ union IO_APIC_reg_01 reg_01;
+ int i;
+ unsigned long flags;
+
+ for (i = 0; i < PIN_MAP_SIZE; i++) {
+ irq_2_pin[i].pin = -1;
+ irq_2_pin[i].next = 0;
+ }
+ if (!pirqs_enabled)
+ for (i = 0; i < MAX_PIRQS; i++)
+ pirq_entries[i] = -1;
+
+ /*
+ * The number of IO-APIC IRQ registers (== #pins):
+ */
+ for (i = 0; i < nr_ioapics; i++) {
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(i, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ nr_ioapic_registers[i] = reg_01.bits.entries+1;
+ }
+
+ /*
+ * Do not trust the IO-APIC being empty at bootup
+ */
+ clear_IO_APIC();
+}
+
+/*
+ * Not an __init, needed by the reboot code
+ */
+void disable_IO_APIC(void)
+{
+ /*
+ * Clear the IO-APIC before rebooting:
+ */
+ clear_IO_APIC();
+
+ disconnect_bsp_APIC();
+}
+
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
+ */
+
+static void __init setup_ioapic_ids_from_mpc (void)
+{
+ union IO_APIC_reg_00 reg_00;
+ int apic;
+ int i;
+ unsigned char old_id;
+ unsigned long flags;
+
+ /*
+ * Set the IOAPIC ID to the value stored in the MPC table.
+ */
+ for (apic = 0; apic < nr_ioapics; apic++) {
+
+ /* Read the register 0 value */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ old_id = mp_ioapics[apic].mpc_apicid;
+
+
+ printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
+
+
+ /*
+ * We need to adjust the IRQ routing table
+ * if the ID changed.
+ */
+ if (old_id != mp_ioapics[apic].mpc_apicid)
+ for (i = 0; i < mp_irq_entries; i++)
+ if (mp_irqs[i].mpc_dstapic == old_id)
+ mp_irqs[i].mpc_dstapic
+ = mp_ioapics[apic].mpc_apicid;
+
+ /*
+ * Read the right value from the MPC table and
+ * write it into the ID register.
+ */
+ apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
+ mp_ioapics[apic].mpc_apicid);
+
+ reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0, reg_00.raw);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /*
+ * Sanity check
+ */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(apic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+ printk("could not set ID!\n");
+ else
+ apic_printk(APIC_VERBOSE," ok.\n");
+ }
+}
+
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ * - timer IRQ defaults to IO-APIC IRQ
+ * - if this function detects that timer IRQs are defunct, then we fall
+ * back to ISA timer IRQs
+ */
+static int __init timer_irq_works(void)
+{
+ unsigned long t1 = jiffies;
+
+ local_irq_enable();
+ /* Let ten ticks pass... */
+ mdelay((10 * 1000) / HZ);
+
+ /*
+ * Expect a few ticks at least, to be sure some possible
+ * glue logic does not lock up after one or two first
+ * ticks in a non-ExtINT mode. Also the local APIC
+ * might have cached one ExtINT interrupt. Finally, at
+ * least one tick may be lost due to delays.
+ */
+
+ /* jiffies wrap? */
+ if (jiffies - t1 > 4)
+ return 1;
+ return 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as thus we do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge triggered needs to resend any interrupt
+ * that was delayed but this is now handled in the device
+ * independent code.
+ */
+
+/*
+ * Starting up a edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need
+ * return 1 to indicate that is was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ */
+
+static unsigned int startup_edge_ioapic_irq(unsigned int irq)
+{
+ int was_pending = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ if (irq < 16) {
+ disable_8259A_irq(irq);
+ if (i8259A_irq_pending(irq))
+ was_pending = 1;
+ }
+ __unmask_IO_APIC_irq(irq);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return was_pending;
+}
+
+/*
+ * Once we have recorded IRQ_PENDING already, we can mask the
+ * interrupt for real. This prevents IRQ storms from unhandled
+ * devices.
+ */
+static void ack_edge_ioapic_irq(unsigned int irq)
+{
+ if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
+ == (IRQ_PENDING | IRQ_DISABLED))
+ mask_IO_APIC_irq(irq);
+ ack_APIC_irq();
+}
+
+/*
+ * Level triggered interrupts can just be masked,
+ * and shutting down and starting up the interrupt
+ * is the same as enabling and disabling them -- except
+ * with a startup need to return a "was pending" value.
+ *
+ * Level triggered interrupts are special because we
+ * do not touch any IO-APIC register while handling
+ * them. We ack the APIC in the end-IRQ handler, not
+ * in the start-IRQ-handler. Protection against reentrance
+ * from the same interrupt is still provided, both by the
+ * generic IRQ layer and by the fact that an unacked local
+ * APIC does not accept IRQs.
+ */
+static unsigned int startup_level_ioapic_irq (unsigned int irq)
+{
+ unmask_IO_APIC_irq(irq);
+
+ return 0; /* don't check for pending */
+}
+
+static void end_level_ioapic_irq (unsigned int irq)
+{
+ ack_APIC_irq();
+}
+
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+ unsigned long flags;
+ unsigned int dest;
+
+ dest = cpu_mask_to_apicid(mask);
+
+ /*
+ * Only the high 8 bits are valid.
+ */
+ dest = SET_APIC_LOGICAL_ID(dest);
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ __DO_ACTION(1, = dest, )
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+#ifdef CONFIG_PCI_MSI
+static unsigned int startup_edge_ioapic_vector(unsigned int vector)
+{
+ int irq = vector_to_irq(vector);
+
+ return startup_edge_ioapic_irq(irq);
+}
+
+static void ack_edge_ioapic_vector(unsigned int vector)
+{
+ int irq = vector_to_irq(vector);
+
+ ack_edge_ioapic_irq(irq);
+}
+
+static unsigned int startup_level_ioapic_vector (unsigned int vector)
+{
+ int irq = vector_to_irq(vector);
+
+ return startup_level_ioapic_irq (irq);
+}
+
+static void end_level_ioapic_vector (unsigned int vector)
+{
+ int irq = vector_to_irq(vector);
+
+ end_level_ioapic_irq(irq);
+}
+
+static void mask_IO_APIC_vector (unsigned int vector)
+{
+ int irq = vector_to_irq(vector);
+
+ mask_IO_APIC_irq(irq);
+}
+
+static void unmask_IO_APIC_vector (unsigned int vector)
+{
+ int irq = vector_to_irq(vector);
+
+ unmask_IO_APIC_irq(irq);
+}
+
+static void set_ioapic_affinity_vector (unsigned int vector,
+ cpumask_t cpu_mask)
+{
+ int irq = vector_to_irq(vector);
+
+ set_ioapic_affinity_irq(irq, cpu_mask);
+}
+#endif
+
+/*
+ * Level and edge triggered IO-APIC interrupts need different handling,
+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
+ * handled with the level-triggered descriptor, but that one has slightly
+ * more overhead. Level-triggered interrupts cannot be handled with the
+ * edge-triggered handler, without risking IRQ storms and other ugly
+ * races.
+ */
+
+static struct hw_interrupt_type ioapic_edge_type = {
+ .typename = "IO-APIC-edge",
+ .startup = startup_edge_ioapic,
+ .shutdown = shutdown_edge_ioapic,
+ .enable = enable_edge_ioapic,
+ .disable = disable_edge_ioapic,
+ .ack = ack_edge_ioapic,
+ .end = end_edge_ioapic,
+ .set_affinity = set_ioapic_affinity,
+};
+
+static struct hw_interrupt_type ioapic_level_type = {
+ .typename = "IO-APIC-level",
+ .startup = startup_level_ioapic,
+ .shutdown = shutdown_level_ioapic,
+ .enable = enable_level_ioapic,
+ .disable = disable_level_ioapic,
+ .ack = mask_and_ack_level_ioapic,
+ .end = end_level_ioapic,
+ .set_affinity = set_ioapic_affinity,
+};
+
+static inline void init_IO_APIC_traps(void)
+{
+ int irq;
+
+ /*
+ * NOTE! The local APIC isn't very good at handling
+ * multiple interrupts at the same interrupt level.
+ * As the interrupt level is determined by taking the
+ * vector number and shifting that right by 4, we
+ * want to spread these out a bit so that they don't
+ * all fall in the same interrupt level.
+ *
+ * Also, we've got to be careful not to trash gate
+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
+ */
+ for (irq = 0; irq < NR_IRQS ; irq++) {
+ int tmp = irq;
+ if (use_pci_vector()) {
+ if (!platform_legacy_irq(tmp))
+ if ((tmp = vector_to_irq(tmp)) == -1)
+ continue;
+ }
+ if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
+ /*
+ * Hmm.. We don't have an entry for this,
+ * so default to an old-fashioned 8259
+ * interrupt if we can..
+ */
+ if (irq < 16)
+ make_8259A_irq(irq);
+ else
+ /* Strange. Oh, well.. */
+ irq_desc[irq].handler = &no_irq_type;
+ }
+ }
+}
+
+static void enable_lapic_irq (unsigned int irq)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
+}
+
+static void disable_lapic_irq (unsigned int irq)
+{
+ unsigned long v;
+
+ v = apic_read(APIC_LVT0);
+ apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+}
+
+static void ack_lapic_irq (unsigned int irq)
+{
+ ack_APIC_irq();
+}
+
+static void end_lapic_irq (unsigned int i) { /* nothing */ }
+
+static struct hw_interrupt_type lapic_irq_type = {
+ .typename = "local-APIC-edge",
+ .startup = NULL, /* startup_irq() not used for IRQ0 */
+ .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
+ .enable = enable_lapic_irq,
+ .disable = disable_lapic_irq,
+ .ack = ack_lapic_irq,
+ .end = end_lapic_irq,
+};
+
+static void setup_nmi (void)
+{
+ /*
+ * Dirty trick to enable the NMI watchdog ...
+ * We put the 8259A master into AEOI mode and
+ * unmask on all local APICs LVT0 as NMI.
+ *
+ * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
+ * is from Maciej W. Rozycki - so we do not have to EOI from
+ * the NMI handler or the timer interrupt.
+ */
+ printk(KERN_INFO "activating NMI Watchdog ...");
+
+ enable_NMI_through_LVT0(NULL);
+
+ printk(" done.\n");
+}
+
+/*
+ * This looks a bit hackish but it's about the only one way of sending
+ * a few INTA cycles to 8259As and any associated glue logic. ICR does
+ * not support the ExtINT mode, unfortunately. We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA. --macro
+ */
+static inline void unlock_ExtINT_logic(void)
+{
+ int pin, i;
+ struct IO_APIC_route_entry entry0, entry1;
+ unsigned char save_control, save_freq_select;
+ unsigned long flags;
+
+ pin = find_isa_irq_pin(8, mp_INT);
+ if (pin == -1)
+ return;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin);
+ *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ clear_IO_APIC_pin(0, pin);
+
+ memset(&entry1, 0, sizeof(entry1));
+
+ entry1.dest_mode = 0; /* physical delivery */
+ entry1.mask = 0; /* unmask IRQ now */
+ entry1.dest.physical.physical_dest = hard_smp_processor_id();
+ entry1.delivery_mode = dest_ExtINT;
+ entry1.polarity = entry0.polarity;
+ entry1.trigger = 0;
+ entry1.vector = 0;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
+ io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ save_control = CMOS_READ(RTC_CONTROL);
+ save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+ CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
+ RTC_FREQ_SELECT);
+ CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
+
+ i = 100;
+ while (i-- > 0) {
+ mdelay(10);
+ if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+ i -= 10;
+ }
+
+ CMOS_WRITE(save_control, RTC_CONTROL);
+ CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+ clear_IO_APIC_pin(0, pin);
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
+ io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
+ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
+ * fanatically on his truly buggy board.
+ */
+static inline void check_timer(void)
+{
+ int pin1, pin2;
+ int vector;
+
+ /*
+ * get/set the timer IRQ vector:
+ */
+ disable_8259A_irq(0);
+ vector = assign_irq_vector(0);
+ set_intr_gate(vector, interrupt[0]);
+
+ /*
+ * Subtle, code in do_timer_interrupt() expects an AEOI
+ * mode for the 8259A whenever interrupts are routed
+ * through I/O APICs. Also IRQ0 has to be enabled in
+ * the 8259A which implies the virtual wire has to be
+ * disabled in the local APIC.
+ */
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+ init_8259A(1);
+ enable_8259A_irq(0);
+
+ pin1 = find_isa_irq_pin(0, mp_INT);
+ pin2 = find_isa_irq_pin(0, mp_ExtINT);
+
+ apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2);
+
+ if (pin1 != -1) {
+ /*
+ * Ok, does IRQ0 through the IOAPIC work?
+ */
+ unmask_IO_APIC_irq(0);
+ if (timer_irq_works()) {
+ nmi_watchdog_default();
+ if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
+ setup_nmi();
+ enable_8259A_irq(0);
+ check_nmi_watchdog();
+ }
+ return;
+ }
+ clear_IO_APIC_pin(0, pin1);
+ apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
+ }
+
+ apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
+ if (pin2 != -1) {
+ apic_printk(APIC_VERBOSE,"\n..... (found pin %d) ...", pin2);
+ /*
+ * legacy devices should be connected to IO APIC #0
+ */
+ setup_ExtINT_IRQ0_pin(pin2, vector);
+ if (timer_irq_works()) {
+ printk("works.\n");
+ nmi_watchdog_default();
+ if (nmi_watchdog == NMI_IO_APIC) {
+ setup_nmi();
+ check_nmi_watchdog();
+ }
+ return;
+ }
+ /*
+ * Cleanup, just in case ...
+ */
+ clear_IO_APIC_pin(0, pin2);
+ }
+ printk(" failed.\n");
+
+ if (nmi_watchdog) {
+ printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+ nmi_watchdog = 0;
+ }
+
+ apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+
+ disable_8259A_irq(0);
+ irq_desc[0].handler = &lapic_irq_type;
+ apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
+ enable_8259A_irq(0);
+
+ if (timer_irq_works()) {
+ apic_printk(APIC_QUIET, " works.\n");
+ return;
+ }
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+ apic_printk(APIC_VERBOSE," failed.\n");
+
+ apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+
+ init_8259A(0);
+ make_8259A_irq(0);
+ apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+
+ unlock_ExtINT_logic();
+
+ if (timer_irq_works()) {
+ apic_printk(APIC_VERBOSE," works.\n");
+ return;
+ }
+ apic_printk(APIC_VERBOSE," failed :(.\n");
+ panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
+}
+
+/*
+ *
+ * IRQ's that are handled by the PIC in the MPS IOAPIC case.
+ * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
+ * Linux doesn't really care, as it's not actually used
+ * for any interrupt handling anyway.
+ */
+#define PIC_IRQS (1<<2)
+
+void __init setup_IO_APIC(void)
+{
+ enable_IO_APIC();
+
+ if (acpi_ioapic)
+ io_apic_irqs = ~0; /* all IRQs go through IOAPIC */
+ else
+ io_apic_irqs = ~PIC_IRQS;
+
+ apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+
+ /*
+ * Set up the IO-APIC IRQ routing table.
+ */
+ if (!acpi_ioapic)
+ setup_ioapic_ids_from_mpc();
+ sync_Arb_IDs();
+ setup_IO_APIC_irqs();
+ init_IO_APIC_traps();
+ check_timer();
+ if (!acpi_ioapic)
+ print_IO_APIC();
+}
+
+struct sysfs_ioapic_data {
+ struct sys_device dev;
+ struct IO_APIC_route_entry entry[0];
+};
+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+
+static int ioapic_suspend(struct sys_device *dev, u32 state)
+{
+ struct IO_APIC_route_entry *entry;
+ struct sysfs_ioapic_data *data;
+ unsigned long flags;
+ int i;
+
+ data = container_of(dev, struct sysfs_ioapic_data, dev);
+ entry = data->entry;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
+ *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
+ *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return 0;
+}
+
+static int ioapic_resume(struct sys_device *dev)
+{
+ struct IO_APIC_route_entry *entry;
+ struct sysfs_ioapic_data *data;
+ unsigned long flags;
+ union IO_APIC_reg_00 reg_00;
+ int i;
+
+ data = container_of(dev, struct sysfs_ioapic_data, dev);
+ entry = data->entry;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(dev->id, 0);
+ if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
+ reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+ io_apic_write(dev->id, 0, reg_00.raw);
+ }
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
+ io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
+ io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return 0;
+}
+
+static struct sysdev_class ioapic_sysdev_class = {
+ set_kset_name("ioapic"),
+ .suspend = ioapic_suspend,
+ .resume = ioapic_resume,
+};
+
+static int __init ioapic_init_sysfs(void)
+{
+ struct sys_device * dev;
+ int i, size, error = 0;
+
+ error = sysdev_class_register(&ioapic_sysdev_class);
+ if (error)
+ return error;
+
+ for (i = 0; i < nr_ioapics; i++ ) {
+ size = sizeof(struct sys_device) + nr_ioapic_registers[i]
+ * sizeof(struct IO_APIC_route_entry);
+ mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+ if (!mp_ioapic_data[i]) {
+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+ continue;
+ }
+ memset(mp_ioapic_data[i], 0, size);
+ dev = &mp_ioapic_data[i]->dev;
+ dev->id = i;
+ dev->cls = &ioapic_sysdev_class;
+ error = sysdev_register(dev);
+ if (error) {
+ kfree(mp_ioapic_data[i]);
+ mp_ioapic_data[i] = NULL;
+ printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+ continue;
+ }
+ }
+
+ return 0;
+}
+
+device_initcall(ioapic_init_sysfs);
+
+/* --------------------------------------------------------------------------
+ ACPI-based IOAPIC Configuration
+ -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI_BOOT
+
+#define IO_APIC_MAX_ID 0xFE
+
+int __init io_apic_get_unique_id (int ioapic, int apic_id)
+{
+ union IO_APIC_reg_00 reg_00;
+ static physid_mask_t apic_id_map;
+ unsigned long flags;
+ int i = 0;
+
+ /*
+ * The P4 platform supports up to 256 APIC IDs on two separate APIC
+ * buses (one for LAPICs, one for IOAPICs), where predecessors only
+ * supports up to 16 on one shared APIC bus.
+ *
+ * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+ * advantage of new APIC bus architecture.
+ */
+
+ if (physids_empty(apic_id_map))
+ apic_id_map = phys_cpu_present_map;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ if (apic_id >= IO_APIC_MAX_ID) {
+ apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+ "%d\n", ioapic, apic_id, reg_00.bits.ID);
+ apic_id = reg_00.bits.ID;
+ }
+
+ /*
+ * Every APIC in a system must have a unique ID or we get lots of nice
+ * 'stuck on smp_invalidate_needed IPI wait' messages.
+ */
+ if (physid_isset(apic_id, apic_id_map)) {
+
+ for (i = 0; i < IO_APIC_MAX_ID; i++) {
+ if (!physid_isset(i, apic_id_map))
+ break;
+ }
+
+ if (i == IO_APIC_MAX_ID)
+ panic("Max apic_id exceeded!\n");
+
+ apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+ "trying %d\n", ioapic, apic_id, i);
+
+ apic_id = i;
+ }
+
+ physid_set(apic_id, apic_id_map);
+
+ if (reg_00.bits.ID != apic_id) {
+ reg_00.bits.ID = apic_id;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(ioapic, 0);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ /* Sanity check */
+ if (reg_00.bits.ID != apic_id)
+ panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic);
+ }
+
+ apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+
+ return apic_id;
+}
+
+
+int __init io_apic_get_version (int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return reg_01.bits.version;
+}
+
+
+int __init io_apic_get_redir_entries (int ioapic)
+{
+ union IO_APIC_reg_01 reg_01;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ reg_01.raw = io_apic_read(ioapic, 1);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return reg_01.bits.entries;
+}
+
+
+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+{
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ if (!IO_APIC_IRQ(irq)) {
+ apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+ ioapic);
+ return -EINVAL;
+ }
+
+ /*
+ * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
+ * Note that we mask (disable) IRQs now -- these get enabled when the
+ * corresponding device driver registers for this IRQ.
+ */
+
+ memset(&entry,0,sizeof(entry));
+
+ entry.delivery_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = INT_DEST_MODE;
+ entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+ entry.trigger = edge_level;
+ entry.polarity = active_high_low;
+ entry.mask = 1; /* Disabled (masked) */
+
+ /*
+ * IRQs < 16 are already in the irq_2_pin[] map
+ */
+ if (irq >= 16)
+ add_pin_to_irq(irq, ioapic, pin);
+
+ entry.vector = assign_irq_vector(irq);
+
+ apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> "
+ "IRQ %d Mode:%i Active:%i)\n", ioapic,
+ mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+ edge_level, active_high_low);
+
+ ioapic_register_intr(irq, entry.vector, edge_level);
+
+ if (!ioapic && (irq < 16))
+ disable_8259A_irq(irq);
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
+ io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+
+ return 0;
+}
+
+#endif /*CONFIG_ACPI_BOOT*/
+
+
+/*
+ * This function currently is only a helper for the i386 smp boot process where
+ * we need to reprogram the ioredtbls to cater for the cpus which have come online
+ * so mask in all cases should simply be TARGET_CPUS
+ */
+void __init setup_ioapic_dest(void)
+{
+ int pin, ioapic, irq, irq_entry;
+
+ if (skip_ioapic_setup == 1)
+ return;
+
+ for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
+ for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+ irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+ if (irq_entry == -1)
+ continue;
+ irq = pin_2_irq(irq_entry, ioapic, pin);
+ set_ioapic_affinity_irq(irq, TARGET_CPUS);
+ }
+
+ }
+}
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
new file mode 100644
index 000000000000..cde0e868efe3
--- /dev/null
+++ b/arch/x86_64/kernel/ioport.c
@@ -0,0 +1,117 @@
+/*
+ * linux/arch/x86_64/kernel/ioport.c
+ *
+ * This contains the io-permission bitmap code - written by obz, with changes
+ * by Linus.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+
+/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
+{
+ int i;
+ if (new_value)
+ for (i = base; i < base + extent; i++)
+ __set_bit(i, bitmap);
+ else
+ for (i = base; i < base + extent; i++)
+ clear_bit(i, bitmap);
+}
+
+/*
+ * this changes the io permissions bitmap in the current task.
+ */
+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+ unsigned int i, max_long, bytes, bytes_updated;
+ struct thread_struct * t = &current->thread;
+ struct tss_struct * tss;
+ unsigned long *bitmap;
+
+ if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+ return -EINVAL;
+ if (turn_on && !capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ /*
+ * If it's the first ioperm() call in this thread's lifetime, set the
+ * IO bitmap up. ioperm() is much less timing critical than clone(),
+ * this is why we delay this operation until now:
+ */
+ if (!t->io_bitmap_ptr) {
+ bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+ if (!bitmap)
+ return -ENOMEM;
+
+ memset(bitmap, 0xff, IO_BITMAP_BYTES);
+ t->io_bitmap_ptr = bitmap;
+ }
+
+ /*
+ * do it in the per-thread copy and in the TSS ...
+ *
+ * Disable preemption via get_cpu() - we must not switch away
+ * because the ->io_bitmap_max value must match the bitmap
+ * contents:
+ */
+ tss = &per_cpu(init_tss, get_cpu());
+
+ set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+
+ /*
+ * Search for a (possibly new) maximum. This is simple and stupid,
+ * to keep it obviously correct:
+ */
+ max_long = 0;
+ for (i = 0; i < IO_BITMAP_LONGS; i++)
+ if (t->io_bitmap_ptr[i] != ~0UL)
+ max_long = i;
+
+ bytes = (max_long + 1) * sizeof(long);
+ bytes_updated = max(bytes, t->io_bitmap_max);
+
+ t->io_bitmap_max = bytes;
+
+ /* Update the TSS: */
+ memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated);
+
+ put_cpu();
+
+ return 0;
+}
+
+/*
+ * sys_iopl has to be used when you want to access the IO ports
+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ *
+ * Here we just change the eflags value on the stack: we allow
+ * only the super-user to do it. This depends on the stack-layout
+ * on system-call entry - see also fork() and the signal handling
+ * code.
+ */
+
+asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
+{
+ unsigned int old = (regs->eflags >> 12) & 3;
+
+ if (level > 3)
+ return -EINVAL;
+ /* Trying to gain more privileges? */
+ if (level > old) {
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+ }
+ regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
+ return 0;
+}
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
new file mode 100644
index 000000000000..62b112e4deb4
--- /dev/null
+++ b/arch/x86_64/kernel/irq.c
@@ -0,0 +1,108 @@
+/*
+ * linux/arch/x86_64/kernel/irq.c
+ *
+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86_64-specific interrupt
+ * entry and irq statistics code. All the remaining irq logic is
+ * done by the generic kernel/irq/ code and in the
+ * x86_64-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <linux/kernel_stat.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/io_apic.h>
+
+atomic_t irq_err_count;
+#ifdef CONFIG_X86_IO_APIC
+#ifdef APIC_MISMATCH_DEBUG
+atomic_t irq_mis_count;
+#endif
+#endif
+
+/*
+ * Generic, controller-independent functions:
+ */
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+ int i = *(loff_t *) v, j;
+ struct irqaction * action;
+ unsigned long flags;
+
+ if (i == 0) {
+ seq_printf(p, " ");
+ for (j=0; j<NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "CPU%d ",j);
+ seq_putc(p, '\n');
+ }
+
+ if (i < NR_IRQS) {
+ spin_lock_irqsave(&irq_desc[i].lock, flags);
+ action = irq_desc[i].action;
+ if (!action)
+ goto skip;
+ seq_printf(p, "%3d: ",i);
+#ifndef CONFIG_SMP
+ seq_printf(p, "%10u ", kstat_irqs(i));
+#else
+ for (j=0; j<NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "%10u ",
+ kstat_cpu(j).irqs[i]);
+#endif
+ seq_printf(p, " %14s", irq_desc[i].handler->typename);
+
+ seq_printf(p, " %s", action->name);
+ for (action=action->next; action; action = action->next)
+ seq_printf(p, ", %s", action->name);
+ seq_putc(p, '\n');
+skip:
+ spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+ } else if (i == NR_IRQS) {
+ seq_printf(p, "NMI: ");
+ for (j = 0; j < NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "%10u ", cpu_pda[j].__nmi_count);
+ seq_putc(p, '\n');
+#ifdef CONFIG_X86_LOCAL_APIC
+ seq_printf(p, "LOC: ");
+ for (j = 0; j < NR_CPUS; j++)
+ if (cpu_online(j))
+ seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs);
+ seq_putc(p, '\n');
+#endif
+ seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#ifdef CONFIG_X86_IO_APIC
+#ifdef APIC_MISMATCH_DEBUG
+ seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+#endif
+#endif
+ }
+ return 0;
+}
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+{
+ /* high bits used in ret_from_ code */
+ unsigned irq = regs->orig_rax & 0xff;
+
+ irq_enter();
+ BUG_ON(irq > 256);
+
+ __do_IRQ(irq, regs);
+ irq_exit();
+
+ return 1;
+}
+
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
new file mode 100644
index 000000000000..4f2a852299b6
--- /dev/null
+++ b/arch/x86_64/kernel/kprobes.c
@@ -0,0 +1,631 @@
+/*
+ * Kernel Probes (KProbes)
+ * arch/x86_64/kernel/kprobes.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
+ * Probes initial implementation ( includes contributions from
+ * Rusty Russell).
+ * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
+ * interface to access function arguments.
+ * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> adapted for x86_64
+ * 2005-Mar Roland McGrath <roland@redhat.com>
+ * Fixed to handle %rip-relative addressing mode correctly.
+ */
+
+#include <linux/config.h>
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/moduleloader.h>
+
+#include <asm/pgtable.h>
+#include <asm/kdebug.h>
+
+static DECLARE_MUTEX(kprobe_mutex);
+
+/* kprobe_status settings */
+#define KPROBE_HIT_ACTIVE 0x00000001
+#define KPROBE_HIT_SS 0x00000002
+
+static struct kprobe *current_kprobe;
+static unsigned long kprobe_status, kprobe_old_rflags, kprobe_saved_rflags;
+static struct pt_regs jprobe_saved_regs;
+static long *jprobe_saved_rsp;
+static kprobe_opcode_t *get_insn_slot(void);
+static void free_insn_slot(kprobe_opcode_t *slot);
+void jprobe_return_end(void);
+
+/* copy of the kernel stack at the probe fire time */
+static kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
+
+/*
+ * returns non-zero if opcode modifies the interrupt flag.
+ */
+static inline int is_IF_modifier(kprobe_opcode_t *insn)
+{
+ switch (*insn) {
+ case 0xfa: /* cli */
+ case 0xfb: /* sti */
+ case 0xcf: /* iret/iretd */
+ case 0x9d: /* popf/popfd */
+ return 1;
+ }
+
+ if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf)
+ return 1;
+ return 0;
+}
+
+int arch_prepare_kprobe(struct kprobe *p)
+{
+ /* insn: must be on special executable page on x86_64. */
+ up(&kprobe_mutex);
+ p->ainsn.insn = get_insn_slot();
+ down(&kprobe_mutex);
+ if (!p->ainsn.insn) {
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+/*
+ * Determine if the instruction uses the %rip-relative addressing mode.
+ * If it does, return the address of the 32-bit displacement word.
+ * If not, return null.
+ */
+static inline s32 *is_riprel(u8 *insn)
+{
+#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 64))
+ static const u64 onebyte_has_modrm[256 / 64] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ------------------------------- */
+ W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */
+ W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */
+ W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */
+ W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */
+ W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */
+ W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */
+ W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */
+ W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */
+ W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */
+ W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */
+ W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */
+ W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */
+ W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */
+ W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */
+ W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */
+ W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */
+ /* ------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ };
+ static const u64 twobyte_has_modrm[256 / 64] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ------------------------------- */
+ W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */
+ W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */
+ W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */
+ W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */
+ W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */
+ W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */
+ W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */
+ W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */
+ W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */
+ W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */
+ W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */
+ W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */
+ W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */
+ W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */
+ W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */
+ W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */
+ /* ------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ };
+#undef W
+ int need_modrm;
+
+ /* Skip legacy instruction prefixes. */
+ while (1) {
+ switch (*insn) {
+ case 0x66:
+ case 0x67:
+ case 0x2e:
+ case 0x3e:
+ case 0x26:
+ case 0x64:
+ case 0x65:
+ case 0x36:
+ case 0xf0:
+ case 0xf3:
+ case 0xf2:
+ ++insn;
+ continue;
+ }
+ break;
+ }
+
+ /* Skip REX instruction prefix. */
+ if ((*insn & 0xf0) == 0x40)
+ ++insn;
+
+ if (*insn == 0x0f) { /* Two-byte opcode. */
+ ++insn;
+ need_modrm = test_bit(*insn, twobyte_has_modrm);
+ } else { /* One-byte opcode. */
+ need_modrm = test_bit(*insn, onebyte_has_modrm);
+ }
+
+ if (need_modrm) {
+ u8 modrm = *++insn;
+ if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */
+ /* Displacement follows ModRM byte. */
+ return (s32 *) ++insn;
+ }
+ }
+
+ /* No %rip-relative addressing mode here. */
+ return NULL;
+}
+
+void arch_copy_kprobe(struct kprobe *p)
+{
+ s32 *ripdisp;
+ memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE);
+ ripdisp = is_riprel(p->ainsn.insn);
+ if (ripdisp) {
+ /*
+ * The copied instruction uses the %rip-relative
+ * addressing mode. Adjust the displacement for the
+ * difference between the original location of this
+ * instruction and the location of the copy that will
+ * actually be run. The tricky bit here is making sure
+ * that the sign extension happens correctly in this
+ * calculation, since we need a signed 32-bit result to
+ * be sign-extended to 64 bits when it's added to the
+ * %rip value and yield the same 64-bit result that the
+ * sign-extension of the original signed 32-bit
+ * displacement would have given.
+ */
+ s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn;
+ BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
+ *ripdisp = disp;
+ }
+}
+
+void arch_remove_kprobe(struct kprobe *p)
+{
+ up(&kprobe_mutex);
+ free_insn_slot(p->ainsn.insn);
+ down(&kprobe_mutex);
+}
+
+static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ *p->addr = p->opcode;
+ regs->rip = (unsigned long)p->addr;
+}
+
+static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+{
+ regs->eflags |= TF_MASK;
+ regs->eflags &= ~IF_MASK;
+ /*single step inline if the instruction is an int3*/
+ if (p->opcode == BREAKPOINT_INSTRUCTION)
+ regs->rip = (unsigned long)p->addr;
+ else
+ regs->rip = (unsigned long)p->ainsn.insn;
+}
+
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate and they
+ * remain disabled thorough out this function.
+ */
+int kprobe_handler(struct pt_regs *regs)
+{
+ struct kprobe *p;
+ int ret = 0;
+ kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
+
+ /* We're in an interrupt, but this is clear and BUG()-safe. */
+ preempt_disable();
+
+ /* Check we're not actually recursing */
+ if (kprobe_running()) {
+ /* We *are* holding lock here, so this is safe.
+ Disarm the probe we just hit, and ignore it. */
+ p = get_kprobe(addr);
+ if (p) {
+ if (kprobe_status == KPROBE_HIT_SS) {
+ regs->eflags &= ~TF_MASK;
+ regs->eflags |= kprobe_saved_rflags;
+ unlock_kprobes();
+ goto no_kprobe;
+ }
+ disarm_kprobe(p, regs);
+ ret = 1;
+ } else {
+ p = current_kprobe;
+ if (p->break_handler && p->break_handler(p, regs)) {
+ goto ss_probe;
+ }
+ }
+ /* If it's not ours, can't be delete race, (we hold lock). */
+ goto no_kprobe;
+ }
+
+ lock_kprobes();
+ p = get_kprobe(addr);
+ if (!p) {
+ unlock_kprobes();
+ if (*addr != BREAKPOINT_INSTRUCTION) {
+ /*
+ * The breakpoint instruction was removed right
+ * after we hit it. Another cpu has removed
+ * either a probepoint or a debugger breakpoint
+ * at this address. In either case, no further
+ * handling of this interrupt is appropriate.
+ */
+ ret = 1;
+ }
+ /* Not one of ours: let kernel handle it */
+ goto no_kprobe;
+ }
+
+ kprobe_status = KPROBE_HIT_ACTIVE;
+ current_kprobe = p;
+ kprobe_saved_rflags = kprobe_old_rflags
+ = (regs->eflags & (TF_MASK | IF_MASK));
+ if (is_IF_modifier(p->ainsn.insn))
+ kprobe_saved_rflags &= ~IF_MASK;
+
+ if (p->pre_handler && p->pre_handler(p, regs))
+ /* handler has already set things up, so skip ss setup */
+ return 1;
+
+ss_probe:
+ prepare_singlestep(p, regs);
+ kprobe_status = KPROBE_HIT_SS;
+ return 1;
+
+no_kprobe:
+ preempt_enable_no_resched();
+ return ret;
+}
+
+/*
+ * Called after single-stepping. p->addr is the address of the
+ * instruction whose first byte has been replaced by the "int 3"
+ * instruction. To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction. The address of this
+ * copy is p->ainsn.insn.
+ *
+ * This function prepares to return from the post-single-step
+ * interrupt. We have to fix up the stack as follows:
+ *
+ * 0) Except in the case of absolute or indirect jump or call instructions,
+ * the new rip is relative to the copied instruction. We need to make
+ * it relative to the original instruction.
+ *
+ * 1) If the single-stepped instruction was pushfl, then the TF and IF
+ * flags are set in the just-pushed eflags, and may need to be cleared.
+ *
+ * 2) If the single-stepped instruction was a call, the return address
+ * that is atop the stack is the address following the copied instruction.
+ * We need to make it the address following the original instruction.
+ */
+static void resume_execution(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long *tos = (unsigned long *)regs->rsp;
+ unsigned long next_rip = 0;
+ unsigned long copy_rip = (unsigned long)p->ainsn.insn;
+ unsigned long orig_rip = (unsigned long)p->addr;
+ kprobe_opcode_t *insn = p->ainsn.insn;
+
+ /*skip the REX prefix*/
+ if (*insn >= 0x40 && *insn <= 0x4f)
+ insn++;
+
+ switch (*insn) {
+ case 0x9c: /* pushfl */
+ *tos &= ~(TF_MASK | IF_MASK);
+ *tos |= kprobe_old_rflags;
+ break;
+ case 0xe8: /* call relative - Fix return addr */
+ *tos = orig_rip + (*tos - copy_rip);
+ break;
+ case 0xff:
+ if ((*insn & 0x30) == 0x10) {
+ /* call absolute, indirect */
+ /* Fix return addr; rip is correct. */
+ next_rip = regs->rip;
+ *tos = orig_rip + (*tos - copy_rip);
+ } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */
+ ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */
+ /* rip is correct. */
+ next_rip = regs->rip;
+ }
+ break;
+ case 0xea: /* jmp absolute -- rip is correct */
+ next_rip = regs->rip;
+ break;
+ default:
+ break;
+ }
+
+ regs->eflags &= ~TF_MASK;
+ if (next_rip) {
+ regs->rip = next_rip;
+ } else {
+ regs->rip = orig_rip + (regs->rip - copy_rip);
+ }
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled thoroughout this function. And we hold kprobe lock.
+ */
+int post_kprobe_handler(struct pt_regs *regs)
+{
+ if (!kprobe_running())
+ return 0;
+
+ if (current_kprobe->post_handler)
+ current_kprobe->post_handler(current_kprobe, regs, 0);
+
+ resume_execution(current_kprobe, regs);
+ regs->eflags |= kprobe_saved_rflags;
+
+ unlock_kprobes();
+ preempt_enable_no_resched();
+
+ /*
+ * if somebody else is singlestepping across a probe point, eflags
+ * will have TF set, in which case, continue the remaining processing
+ * of do_debug, as if this is not a probe hit.
+ */
+ if (regs->eflags & TF_MASK)
+ return 0;
+
+ return 1;
+}
+
+/* Interrupts disabled, kprobe_lock held. */
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+ if (current_kprobe->fault_handler
+ && current_kprobe->fault_handler(current_kprobe, regs, trapnr))
+ return 1;
+
+ if (kprobe_status & KPROBE_HIT_SS) {
+ resume_execution(current_kprobe, regs);
+ regs->eflags |= kprobe_old_rflags;
+
+ unlock_kprobes();
+ preempt_enable_no_resched();
+ }
+ return 0;
+}
+
+/*
+ * Wrapper routine for handling exceptions.
+ */
+int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
+ void *data)
+{
+ struct die_args *args = (struct die_args *)data;
+ switch (val) {
+ case DIE_INT3:
+ if (kprobe_handler(args->regs))
+ return NOTIFY_STOP;
+ break;
+ case DIE_DEBUG:
+ if (post_kprobe_handler(args->regs))
+ return NOTIFY_STOP;
+ break;
+ case DIE_GPF:
+ if (kprobe_running() &&
+ kprobe_fault_handler(args->regs, args->trapnr))
+ return NOTIFY_STOP;
+ break;
+ case DIE_PAGE_FAULT:
+ if (kprobe_running() &&
+ kprobe_fault_handler(args->regs, args->trapnr))
+ return NOTIFY_STOP;
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct jprobe *jp = container_of(p, struct jprobe, kp);
+ unsigned long addr;
+
+ jprobe_saved_regs = *regs;
+ jprobe_saved_rsp = (long *) regs->rsp;
+ addr = (unsigned long)jprobe_saved_rsp;
+ /*
+ * As Linus pointed out, gcc assumes that the callee
+ * owns the argument space and could overwrite it, e.g.
+ * tailcall optimization. So, to be absolutely safe
+ * we also save and restore enough stack bytes to cover
+ * the argument area.
+ */
+ memcpy(jprobes_stack, (kprobe_opcode_t *) addr, MIN_STACK_SIZE(addr));
+ regs->eflags &= ~IF_MASK;
+ regs->rip = (unsigned long)(jp->entry);
+ return 1;
+}
+
+void jprobe_return(void)
+{
+ preempt_enable_no_resched();
+ asm volatile (" xchg %%rbx,%%rsp \n"
+ " int3 \n"
+ " .globl jprobe_return_end \n"
+ " jprobe_return_end: \n"
+ " nop \n"::"b"
+ (jprobe_saved_rsp):"memory");
+}
+
+int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ u8 *addr = (u8 *) (regs->rip - 1);
+ unsigned long stack_addr = (unsigned long)jprobe_saved_rsp;
+ struct jprobe *jp = container_of(p, struct jprobe, kp);
+
+ if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) {
+ if ((long *)regs->rsp != jprobe_saved_rsp) {
+ struct pt_regs *saved_regs =
+ container_of(jprobe_saved_rsp, struct pt_regs, rsp);
+ printk("current rsp %p does not match saved rsp %p\n",
+ (long *)regs->rsp, jprobe_saved_rsp);
+ printk("Saved registers for jprobe %p\n", jp);
+ show_registers(saved_regs);
+ printk("Current registers\n");
+ show_registers(regs);
+ BUG();
+ }
+ *regs = jprobe_saved_regs;
+ memcpy((kprobe_opcode_t *) stack_addr, jprobes_stack,
+ MIN_STACK_SIZE(stack_addr));
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped.
+ * By default on x86_64, pages we get from kmalloc or vmalloc are not
+ * executable. Single-stepping an instruction on such a page yields an
+ * oops. So instead of storing the instruction copies in their respective
+ * kprobe objects, we allocate a page, map it executable, and store all the
+ * instruction copies there. (We can allocate additional pages if somebody
+ * inserts a huge number of probes.) Each page can hold up to INSNS_PER_PAGE
+ * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t)
+ * bytes.
+ */
+#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t)))
+struct kprobe_insn_page {
+ struct hlist_node hlist;
+ kprobe_opcode_t *insns; /* page of instruction slots */
+ char slot_used[INSNS_PER_PAGE];
+ int nused;
+};
+
+static struct hlist_head kprobe_insn_pages;
+
+/**
+ * get_insn_slot() - Find a slot on an executable page for an instruction.
+ * We allocate an executable page if there's no room on existing ones.
+ */
+static kprobe_opcode_t *get_insn_slot(void)
+{
+ struct kprobe_insn_page *kip;
+ struct hlist_node *pos;
+
+ hlist_for_each(pos, &kprobe_insn_pages) {
+ kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+ if (kip->nused < INSNS_PER_PAGE) {
+ int i;
+ for (i = 0; i < INSNS_PER_PAGE; i++) {
+ if (!kip->slot_used[i]) {
+ kip->slot_used[i] = 1;
+ kip->nused++;
+ return kip->insns + (i*MAX_INSN_SIZE);
+ }
+ }
+ /* Surprise! No unused slots. Fix kip->nused. */
+ kip->nused = INSNS_PER_PAGE;
+ }
+ }
+
+ /* All out of space. Need to allocate a new page. Use slot 0.*/
+ kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
+ if (!kip) {
+ return NULL;
+ }
+
+ /*
+ * For the %rip-relative displacement fixups to be doable, we
+ * need our instruction copy to be within +/- 2GB of any data it
+ * might access via %rip. That is, within 2GB of where the
+ * kernel image and loaded module images reside. So we allocate
+ * a page in the module loading area.
+ */
+ kip->insns = module_alloc(PAGE_SIZE);
+ if (!kip->insns) {
+ kfree(kip);
+ return NULL;
+ }
+ INIT_HLIST_NODE(&kip->hlist);
+ hlist_add_head(&kip->hlist, &kprobe_insn_pages);
+ memset(kip->slot_used, 0, INSNS_PER_PAGE);
+ kip->slot_used[0] = 1;
+ kip->nused = 1;
+ return kip->insns;
+}
+
+/**
+ * free_insn_slot() - Free instruction slot obtained from get_insn_slot().
+ */
+static void free_insn_slot(kprobe_opcode_t *slot)
+{
+ struct kprobe_insn_page *kip;
+ struct hlist_node *pos;
+
+ hlist_for_each(pos, &kprobe_insn_pages) {
+ kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+ if (kip->insns <= slot
+ && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) {
+ int i = (slot - kip->insns) / MAX_INSN_SIZE;
+ kip->slot_used[i] = 0;
+ kip->nused--;
+ if (kip->nused == 0) {
+ /*
+ * Page is no longer in use. Free it unless
+ * it's the last one. We keep the last one
+ * so as not to have to set it up again the
+ * next time somebody inserts a probe.
+ */
+ hlist_del(&kip->hlist);
+ if (hlist_empty(&kprobe_insn_pages)) {
+ INIT_HLIST_NODE(&kip->hlist);
+ hlist_add_head(&kip->hlist,
+ &kprobe_insn_pages);
+ } else {
+ module_free(NULL, kip->insns);
+ kfree(kip);
+ }
+ }
+ return;
+ }
+ }
+}
diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c
new file mode 100644
index 000000000000..d7e5d0cf4285
--- /dev/null
+++ b/arch/x86_64/kernel/ldt.c
@@ -0,0 +1,253 @@
+/*
+ * linux/arch/x86_64/kernel/ldt.c
+ *
+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2002 Andi Kleen
+ *
+ * This handles calls from both 32bit and 64bit mode.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/proto.h>
+
+#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
+static void flush_ldt(void *null)
+{
+ if (current->active_mm)
+ load_LDT(&current->active_mm->context);
+}
+#endif
+
+static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload)
+{
+ void *oldldt;
+ void *newldt;
+ unsigned oldsize;
+
+ if (mincount <= (unsigned)pc->size)
+ return 0;
+ oldsize = pc->size;
+ mincount = (mincount+511)&(~511);
+ if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
+ newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
+ else
+ newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
+
+ if (!newldt)
+ return -ENOMEM;
+
+ if (oldsize)
+ memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
+ oldldt = pc->ldt;
+ memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
+ wmb();
+ pc->ldt = newldt;
+ wmb();
+ pc->size = mincount;
+ wmb();
+ if (reload) {
+#ifdef CONFIG_SMP
+ cpumask_t mask;
+
+ preempt_disable();
+ mask = cpumask_of_cpu(smp_processor_id());
+ load_LDT(pc);
+ if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+ smp_call_function(flush_ldt, NULL, 1, 1);
+ preempt_enable();
+#else
+ load_LDT(pc);
+#endif
+ }
+ if (oldsize) {
+ if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(oldldt);
+ else
+ kfree(oldldt);
+ }
+ return 0;
+}
+
+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+{
+ int err = alloc_ldt(new, old->size, 0);
+ if (err < 0)
+ return err;
+ memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+ return 0;
+}
+
+/*
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+ struct mm_struct * old_mm;
+ int retval = 0;
+
+ init_MUTEX(&mm->context.sem);
+ mm->context.size = 0;
+ old_mm = current->mm;
+ if (old_mm && old_mm->context.size > 0) {
+ down(&old_mm->context.sem);
+ retval = copy_ldt(&mm->context, &old_mm->context);
+ up(&old_mm->context.sem);
+ }
+ return retval;
+}
+
+/*
+ *
+ * Don't touch the LDT register - we're already in the next thread.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+ if (mm->context.size) {
+ if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(mm->context.ldt);
+ else
+ kfree(mm->context.ldt);
+ mm->context.size = 0;
+ }
+}
+
+static int read_ldt(void __user * ptr, unsigned long bytecount)
+{
+ int err;
+ unsigned long size;
+ struct mm_struct * mm = current->mm;
+
+ if (!mm->context.size)
+ return 0;
+ if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
+ bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
+
+ down(&mm->context.sem);
+ size = mm->context.size*LDT_ENTRY_SIZE;
+ if (size > bytecount)
+ size = bytecount;
+
+ err = 0;
+ if (copy_to_user(ptr, mm->context.ldt, size))
+ err = -EFAULT;
+ up(&mm->context.sem);
+ if (err < 0)
+ goto error_return;
+ if (size != bytecount) {
+ /* zero-fill the rest */
+ if (clear_user(ptr+size, bytecount-size) != 0) {
+ err = -EFAULT;
+ goto error_return;
+ }
+ }
+ return bytecount;
+error_return:
+ return err;
+}
+
+static int read_default_ldt(void __user * ptr, unsigned long bytecount)
+{
+ /* Arbitrary number */
+ /* x86-64 default LDT is all zeros */
+ if (bytecount > 128)
+ bytecount = 128;
+ if (clear_user(ptr, bytecount))
+ return -EFAULT;
+ return bytecount;
+}
+
+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
+{
+ struct task_struct *me = current;
+ struct mm_struct * mm = me->mm;
+ __u32 entry_1, entry_2, *lp;
+ int error;
+ struct user_desc ldt_info;
+
+ error = -EINVAL;
+
+ if (bytecount != sizeof(ldt_info))
+ goto out;
+ error = -EFAULT;
+ if (copy_from_user(&ldt_info, ptr, bytecount))
+ goto out;
+
+ error = -EINVAL;
+ if (ldt_info.entry_number >= LDT_ENTRIES)
+ goto out;
+ if (ldt_info.contents == 3) {
+ if (oldmode)
+ goto out;
+ if (ldt_info.seg_not_present == 0)
+ goto out;
+ }
+
+ down(&mm->context.sem);
+ if (ldt_info.entry_number >= (unsigned)mm->context.size) {
+ error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
+ if (error < 0)
+ goto out_unlock;
+ }
+
+ lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
+
+ /* Allow LDTs to be cleared by the user. */
+ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+ if (oldmode || LDT_empty(&ldt_info)) {
+ entry_1 = 0;
+ entry_2 = 0;
+ goto install;
+ }
+ }
+
+ entry_1 = LDT_entry_a(&ldt_info);
+ entry_2 = LDT_entry_b(&ldt_info);
+ if (oldmode)
+ entry_2 &= ~(1 << 20);
+
+ /* Install the new entry ... */
+install:
+ *lp = entry_1;
+ *(lp+1) = entry_2;
+ error = 0;
+
+out_unlock:
+ up(&mm->context.sem);
+out:
+ return error;
+}
+
+asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+{
+ int ret = -ENOSYS;
+
+ switch (func) {
+ case 0:
+ ret = read_ldt(ptr, bytecount);
+ break;
+ case 1:
+ ret = write_ldt(ptr, bytecount, 1);
+ break;
+ case 2:
+ ret = read_default_ldt(ptr, bytecount);
+ break;
+ case 0x11:
+ ret = write_ldt(ptr, bytecount, 0);
+ break;
+ }
+ return ret;
+}
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
new file mode 100644
index 000000000000..86f9fd85016a
--- /dev/null
+++ b/arch/x86_64/kernel/mce.c
@@ -0,0 +1,548 @@
+/*
+ * Machine check handler.
+ * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Rest from unknown author(s).
+ * 2004 Andi Kleen. Rewrote most of it.
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/rcupdate.h>
+#include <linux/kallsyms.h>
+#include <linux/sysdev.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/mce.h>
+#include <asm/kdebug.h>
+#include <asm/uaccess.h>
+
+#define MISC_MCELOG_MINOR 227
+#define NR_BANKS 5
+
+static int mce_dont_init;
+
+/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
+ 3: never panic or exit (for testing only) */
+static int tolerant = 1;
+static int banks;
+static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
+static unsigned long console_logged;
+static int notify_user;
+
+/*
+ * Lockless MCE logging infrastructure.
+ * This avoids deadlocks on printk locks without having to break locks. Also
+ * separate MCEs from kernel messages to avoid bogus bug reports.
+ */
+
+struct mce_log mcelog = {
+ MCE_LOG_SIGNATURE,
+ MCE_LOG_LEN,
+};
+
+void mce_log(struct mce *mce)
+{
+ unsigned next, entry;
+ mce->finished = 0;
+ smp_wmb();
+ for (;;) {
+ entry = rcu_dereference(mcelog.next);
+ /* When the buffer fills up discard new entries. Assume
+ that the earlier errors are the more interesting. */
+ if (entry >= MCE_LOG_LEN) {
+ set_bit(MCE_OVERFLOW, &mcelog.flags);
+ return;
+ }
+ /* Old left over entry. Skip. */
+ if (mcelog.entry[entry].finished)
+ continue;
+ smp_rmb();
+ next = entry + 1;
+ if (cmpxchg(&mcelog.next, entry, next) == entry)
+ break;
+ }
+ memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
+ smp_wmb();
+ mcelog.entry[entry].finished = 1;
+ smp_wmb();
+
+ if (!test_and_set_bit(0, &console_logged))
+ notify_user = 1;
+}
+
+static void print_mce(struct mce *m)
+{
+ printk(KERN_EMERG "\n"
+ KERN_EMERG
+ "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+ m->cpu, m->mcgstatus, m->bank, m->status);
+ if (m->rip) {
+ printk(KERN_EMERG
+ "RIP%s %02x:<%016Lx> ",
+ !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
+ m->cs, m->rip);
+ if (m->cs == __KERNEL_CS)
+ print_symbol("{%s}", m->rip);
+ printk("\n");
+ }
+ printk(KERN_EMERG "TSC %Lx ", m->tsc);
+ if (m->addr)
+ printk("ADDR %Lx ", m->addr);
+ if (m->misc)
+ printk("MISC %Lx ", m->misc);
+ printk("\n");
+}
+
+static void mce_panic(char *msg, struct mce *backup, unsigned long start)
+{
+ int i;
+ oops_begin();
+ for (i = 0; i < MCE_LOG_LEN; i++) {
+ unsigned long tsc = mcelog.entry[i].tsc;
+ if (time_before(tsc, start))
+ continue;
+ print_mce(&mcelog.entry[i]);
+ if (backup && mcelog.entry[i].tsc == backup->tsc)
+ backup = NULL;
+ }
+ if (backup)
+ print_mce(backup);
+ if (tolerant >= 3)
+ printk("Fake panic: %s\n", msg);
+ else
+ panic(msg);
+}
+
+static int mce_available(struct cpuinfo_x86 *c)
+{
+ return test_bit(X86_FEATURE_MCE, &c->x86_capability) &&
+ test_bit(X86_FEATURE_MCA, &c->x86_capability);
+}
+
+/*
+ * The actual machine check handler
+ */
+
+void do_machine_check(struct pt_regs * regs, long error_code)
+{
+ struct mce m, panicm;
+ int nowayout = (tolerant < 1);
+ int kill_it = 0;
+ u64 mcestart = 0;
+ int i;
+ int panicm_found = 0;
+
+ if (regs)
+ notify_die(DIE_NMI, "machine check", regs, error_code, 255, SIGKILL);
+ if (!banks)
+ return;
+
+ memset(&m, 0, sizeof(struct mce));
+ m.cpu = hard_smp_processor_id();
+ rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+ if (!(m.mcgstatus & MCG_STATUS_RIPV))
+ kill_it = 1;
+
+ rdtscll(mcestart);
+ barrier();
+
+ for (i = 0; i < banks; i++) {
+ if (!bank[i])
+ continue;
+
+ m.misc = 0;
+ m.addr = 0;
+ m.bank = i;
+ m.tsc = 0;
+
+ rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+ if ((m.status & MCI_STATUS_VAL) == 0)
+ continue;
+
+ if (m.status & MCI_STATUS_EN) {
+ /* In theory _OVER could be a nowayout too, but
+ assume any overflowed errors were no fatal. */
+ nowayout |= !!(m.status & MCI_STATUS_PCC);
+ kill_it |= !!(m.status & MCI_STATUS_UC);
+ }
+
+ if (m.status & MCI_STATUS_MISCV)
+ rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+ if (m.status & MCI_STATUS_ADDRV)
+ rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+ if (regs && (m.mcgstatus & MCG_STATUS_RIPV)) {
+ m.rip = regs->rip;
+ m.cs = regs->cs;
+ } else {
+ m.rip = 0;
+ m.cs = 0;
+ }
+
+ if (error_code != -1)
+ rdtscll(m.tsc);
+ wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
+ mce_log(&m);
+
+ /* Did this bank cause the exception? */
+ /* Assume that the bank with uncorrectable errors did it,
+ and that there is only a single one. */
+ if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
+ panicm = m;
+ panicm_found = 1;
+ }
+
+ tainted |= TAINT_MACHINE_CHECK;
+ }
+
+ /* Never do anything final in the polling timer */
+ if (!regs)
+ goto out;
+
+ /* If we didn't find an uncorrectable error, pick
+ the last one (shouldn't happen, just being safe). */
+ if (!panicm_found)
+ panicm = m;
+ if (nowayout)
+ mce_panic("Machine check", &panicm, mcestart);
+ if (kill_it) {
+ int user_space = 0;
+
+ if (m.mcgstatus & MCG_STATUS_RIPV)
+ user_space = panicm.rip && (panicm.cs & 3);
+
+ /* When the machine was in user space and the CPU didn't get
+ confused it's normally not necessary to panic, unless you
+ are paranoid (tolerant == 0)
+
+ RED-PEN could be more tolerant for MCEs in idle,
+ but most likely they occur at boot anyways, where
+ it is best to just halt the machine. */
+ if ((!user_space && (panic_on_oops || tolerant < 2)) ||
+ (unsigned)current->pid <= 1)
+ mce_panic("Uncorrected machine check", &panicm, mcestart);
+
+ /* do_exit takes an awful lot of locks and has as
+ slight risk of deadlocking. If you don't want that
+ don't set tolerant >= 2 */
+ if (tolerant < 3)
+ do_exit(SIGBUS);
+ }
+
+ out:
+ /* Last thing done in the machine check exception to clear state. */
+ wrmsrl(MSR_IA32_MCG_STATUS, 0);
+}
+
+/*
+ * Periodic polling timer for "silent" machine check errors.
+ */
+
+static int check_interval = 5 * 60; /* 5 minutes */
+static void mcheck_timer(void *data);
+static DECLARE_WORK(mcheck_work, mcheck_timer, NULL);
+
+static void mcheck_check_cpu(void *info)
+{
+ if (mce_available(&current_cpu_data))
+ do_machine_check(NULL, 0);
+}
+
+static void mcheck_timer(void *data)
+{
+ on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
+ schedule_delayed_work(&mcheck_work, check_interval * HZ);
+
+ /*
+ * It's ok to read stale data here for notify_user and
+ * console_logged as we'll simply get the updated versions
+ * on the next mcheck_timer execution and atomic operations
+ * on console_logged act as synchronization for notify_user
+ * writes.
+ */
+ if (notify_user && console_logged) {
+ notify_user = 0;
+ clear_bit(0, &console_logged);
+ printk(KERN_INFO "Machine check events logged\n");
+ }
+}
+
+
+static __init int periodic_mcheck_init(void)
+{
+ if (check_interval)
+ schedule_delayed_work(&mcheck_work, check_interval*HZ);
+ return 0;
+}
+__initcall(periodic_mcheck_init);
+
+
+/*
+ * Initialize Machine Checks for a CPU.
+ */
+static void mce_init(void *dummy)
+{
+ u64 cap;
+ int i;
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ banks = cap & 0xff;
+ if (banks > NR_BANKS) {
+ printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
+ banks = NR_BANKS;
+ }
+
+ /* Log the machine checks left over from the previous reset.
+ This also clears all registers */
+ do_machine_check(NULL, -1);
+
+ set_in_cr4(X86_CR4_MCE);
+
+ if (cap & MCG_CTL_P)
+ wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
+
+ for (i = 0; i < banks; i++) {
+ wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
+ wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+ }
+}
+
+/* Add per CPU specific workarounds here */
+static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
+{
+ /* This should be disabled by the BIOS, but isn't always */
+ if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
+ /* disable GART TBL walk error reporting, which trips off
+ incorrectly with the IOMMU & 3ware & Cerberus. */
+ clear_bit(10, &bank[4]);
+ }
+}
+
+static void __init mce_cpu_features(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_init(c);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * Called for each booted CPU to set up machine checks.
+ * Must be called with preempt off.
+ */
+void __init mcheck_init(struct cpuinfo_x86 *c)
+{
+ static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
+
+ mce_cpu_quirks(c);
+
+ if (mce_dont_init ||
+ cpu_test_and_set(smp_processor_id(), mce_cpus) ||
+ !mce_available(c))
+ return;
+
+ mce_init(NULL);
+ mce_cpu_features(c);
+}
+
+/*
+ * Character device to read and clear the MCE log.
+ */
+
+static void collect_tscs(void *data)
+{
+ unsigned long *cpu_tsc = (unsigned long *)data;
+ rdtscll(cpu_tsc[smp_processor_id()]);
+}
+
+static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
+{
+ unsigned long cpu_tsc[NR_CPUS];
+ static DECLARE_MUTEX(mce_read_sem);
+ unsigned next;
+ char __user *buf = ubuf;
+ int i, err;
+
+ down(&mce_read_sem);
+ next = rcu_dereference(mcelog.next);
+
+ /* Only supports full reads right now */
+ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
+ up(&mce_read_sem);
+ return -EINVAL;
+ }
+
+ err = 0;
+ for (i = 0; i < next; i++) {
+ if (!mcelog.entry[i].finished)
+ continue;
+ smp_rmb();
+ err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
+ buf += sizeof(struct mce);
+ }
+
+ memset(mcelog.entry, 0, next * sizeof(struct mce));
+ mcelog.next = 0;
+
+ synchronize_kernel();
+
+ /* Collect entries that were still getting written before the synchronize. */
+
+ on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
+ for (i = next; i < MCE_LOG_LEN; i++) {
+ if (mcelog.entry[i].finished &&
+ mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
+ err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
+ smp_rmb();
+ buf += sizeof(struct mce);
+ memset(&mcelog.entry[i], 0, sizeof(struct mce));
+ }
+ }
+ up(&mce_read_sem);
+ return err ? -EFAULT : buf - ubuf;
+}
+
+static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
+{
+ int __user *p = (int __user *)arg;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ switch (cmd) {
+ case MCE_GET_RECORD_LEN:
+ return put_user(sizeof(struct mce), p);
+ case MCE_GET_LOG_LEN:
+ return put_user(MCE_LOG_LEN, p);
+ case MCE_GETCLEAR_FLAGS: {
+ unsigned flags;
+ do {
+ flags = mcelog.flags;
+ } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+ return put_user(flags, p);
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+static struct file_operations mce_chrdev_ops = {
+ .read = mce_read,
+ .ioctl = mce_ioctl,
+};
+
+static struct miscdevice mce_log_device = {
+ MISC_MCELOG_MINOR,
+ "mcelog",
+ &mce_chrdev_ops,
+};
+
+/*
+ * Old style boot options parsing. Only for compatibility.
+ */
+
+static int __init mcheck_disable(char *str)
+{
+ mce_dont_init = 1;
+ return 0;
+}
+
+/* mce=off disables machine check. Note you can reenable it later
+ using sysfs */
+static int __init mcheck_enable(char *str)
+{
+ if (!strcmp(str, "off"))
+ mce_dont_init = 1;
+ else
+ printk("mce= argument %s ignored. Please use /sys", str);
+ return 0;
+}
+
+__setup("nomce", mcheck_disable);
+__setup("mce", mcheck_enable);
+
+/*
+ * Sysfs support
+ */
+
+/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */
+static int mce_resume(struct sys_device *dev)
+{
+ on_each_cpu(mce_init, NULL, 1, 1);
+ return 0;
+}
+
+/* Reinit MCEs after user configuration changes */
+static void mce_restart(void)
+{
+ if (check_interval)
+ cancel_delayed_work(&mcheck_work);
+ /* Timer race is harmless here */
+ on_each_cpu(mce_init, NULL, 1, 1);
+ if (check_interval)
+ schedule_delayed_work(&mcheck_work, check_interval*HZ);
+}
+
+static struct sysdev_class mce_sysclass = {
+ .resume = mce_resume,
+ set_kset_name("machinecheck"),
+};
+
+static struct sys_device device_mce = {
+ .id = 0,
+ .cls = &mce_sysclass,
+};
+
+/* Why are there no generic functions for this? */
+#define ACCESSOR(name, var, start) \
+ static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
+ return sprintf(buf, "%lx\n", (unsigned long)var); \
+ } \
+ static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
+ char *end; \
+ unsigned long new = simple_strtoul(buf, &end, 0); \
+ if (end == buf) return -EINVAL; \
+ var = new; \
+ start; \
+ return end-buf; \
+ } \
+ static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
+
+ACCESSOR(bank0ctl,bank[0],mce_restart())
+ACCESSOR(bank1ctl,bank[1],mce_restart())
+ACCESSOR(bank2ctl,bank[2],mce_restart())
+ACCESSOR(bank3ctl,bank[3],mce_restart())
+ACCESSOR(bank4ctl,bank[4],mce_restart())
+ACCESSOR(tolerant,tolerant,)
+ACCESSOR(check_interval,check_interval,mce_restart())
+
+static __init int mce_init_device(void)
+{
+ int err;
+ if (!mce_available(&boot_cpu_data))
+ return -EIO;
+ err = sysdev_class_register(&mce_sysclass);
+ if (!err)
+ err = sysdev_register(&device_mce);
+ if (!err) {
+ /* could create per CPU objects, but it is not worth it. */
+ sysdev_create_file(&device_mce, &attr_bank0ctl);
+ sysdev_create_file(&device_mce, &attr_bank1ctl);
+ sysdev_create_file(&device_mce, &attr_bank2ctl);
+ sysdev_create_file(&device_mce, &attr_bank3ctl);
+ sysdev_create_file(&device_mce, &attr_bank4ctl);
+ sysdev_create_file(&device_mce, &attr_tolerant);
+ sysdev_create_file(&device_mce, &attr_check_interval);
+ }
+
+ misc_register(&mce_log_device);
+ return err;
+
+}
+device_initcall(mce_init_device);
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
new file mode 100644
index 000000000000..4db9a640069f
--- /dev/null
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -0,0 +1,99 @@
+/*
+ * Intel specific MCE features.
+ * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <asm/processor.h>
+#include <asm/msr.h>
+#include <asm/mce.h>
+#include <asm/hw_irq.h>
+
+static DEFINE_PER_CPU(unsigned long, next_check);
+
+asmlinkage void smp_thermal_interrupt(void)
+{
+ struct mce m;
+
+ ack_APIC_irq();
+
+ irq_enter();
+ if (time_before(jiffies, __get_cpu_var(next_check)))
+ goto done;
+
+ __get_cpu_var(next_check) = jiffies + HZ*300;
+ memset(&m, 0, sizeof(m));
+ m.cpu = smp_processor_id();
+ m.bank = MCE_THERMAL_BANK;
+ rdtscll(m.tsc);
+ rdmsrl(MSR_IA32_THERM_STATUS, m.status);
+ if (m.status & 0x1) {
+ printk(KERN_EMERG
+ "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu);
+ add_taint(TAINT_MACHINE_CHECK);
+ } else {
+ printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu);
+ }
+
+ mce_log(&m);
+done:
+ irq_exit();
+}
+
+static void __init intel_init_thermal(struct cpuinfo_x86 *c)
+{
+ u32 l, h;
+ int tm2 = 0;
+ unsigned int cpu = smp_processor_id();
+
+ if (!cpu_has(c, X86_FEATURE_ACPI))
+ return;
+
+ if (!cpu_has(c, X86_FEATURE_ACC))
+ return;
+
+ /* first check if TM1 is already enabled by the BIOS, in which
+ * case there might be some SMM goo which handles it, so we can't even
+ * put a handler since it might be delivered via SMI already.
+ */
+ rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+ h = apic_read(APIC_LVTTHMR);
+ if ((l & (1 << 3)) && (h & APIC_DM_SMI)) {
+ printk(KERN_DEBUG
+ "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+ return;
+ }
+
+ if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13)))
+ tm2 = 1;
+
+ if (h & APIC_VECTOR_MASK) {
+ printk(KERN_DEBUG
+ "CPU%d: Thermal LVT vector (%#x) already "
+ "installed\n", cpu, (h & APIC_VECTOR_MASK));
+ return;
+ }
+
+ h = THERMAL_APIC_VECTOR;
+ h |= (APIC_DM_FIXED | APIC_LVT_MASKED);
+ apic_write_around(APIC_LVTTHMR, h);
+
+ rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
+ wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03, h);
+
+ rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+ wrmsr(MSR_IA32_MISC_ENABLE, l | (1 << 3), h);
+
+ l = apic_read(APIC_LVTTHMR);
+ apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+ printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
+ cpu, tm2 ? "TM2" : "TM1");
+ return;
+}
+
+void __init mce_intel_feature_init(struct cpuinfo_x86 *c)
+{
+ intel_init_thermal(c);
+}
diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c
new file mode 100644
index 000000000000..c2ffea8845ed
--- /dev/null
+++ b/arch/x86_64/kernel/module.c
@@ -0,0 +1,166 @@
+/* Kernel module help for x86-64
+ Copyright (C) 2001 Rusty Russell.
+ Copyright (C) 2002,2003 Andi Kleen, SuSE Labs.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include <linux/moduleloader.h>
+#include <linux/elf.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#define DEBUGP(fmt...)
+
+void module_free(struct module *mod, void *module_region)
+{
+ vfree(module_region);
+}
+
+void *module_alloc(unsigned long size)
+{
+ struct vm_struct *area;
+
+ if (!size)
+ return NULL;
+ size = PAGE_ALIGN(size);
+ if (size > MODULES_LEN)
+ return NULL;
+
+ area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
+ if (!area)
+ return NULL;
+
+ return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC);
+}
+
+/* We don't need anything special. */
+int module_frob_arch_sections(Elf_Ehdr *hdr,
+ Elf_Shdr *sechdrs,
+ char *secstrings,
+ struct module *mod)
+{
+ return 0;
+}
+