From fd986379eef48d89afea07825808d92c35446ff8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 29 Feb 2012 15:22:34 +0100 Subject: [PATCH] - Update Xen patches to 3.3-rc5 and c/s 1157. - config.conf: Re-enable Xen configs. - Update config files. suse-commit: f22562233702e32624577b9254075e086c8da001 --- .../ABI/testing/sysfs-kernel-mm-cleancache | 11 - Documentation/kernel-parameters.txt | 29 + Documentation/vm/cleancache.txt | 41 +- Documentation/vm/frontswap.txt | 210 + arch/ia64/Kconfig | 2 +- arch/ia64/Makefile | 2 +- arch/ia64/include/asm/xen/hypervisor.h | 4 +- arch/ia64/include/asm/xen/interface.h | 25 +- arch/ia64/kernel/asm-offsets.c | 2 +- arch/ia64/kernel/vmlinux.lds.S | 2 +- arch/ia64/xen/Kconfig | 8 +- arch/ia64/xen/xcom_hcall.c | 2 +- arch/x86/Kbuild | 2 +- arch/x86/Kconfig | 173 +- arch/x86/Kconfig.cpu | 15 +- arch/x86/Kconfig.debug | 6 +- arch/x86/Makefile | 22 +- arch/x86/boot/Makefile | 17 +- arch/x86/ia32/ia32entry-xen.S | 383 ++ arch/x86/include/asm/acpi.h | 33 + arch/x86/include/asm/agp.h | 3 + arch/x86/include/asm/apic.h | 26 + arch/x86/include/asm/apicdef.h | 16 + arch/x86/include/asm/boot.h | 2 +- arch/x86/include/asm/cpufeature.h | 4 + arch/x86/include/asm/debugreg.h | 2 +- arch/x86/include/asm/e820.h | 4 + arch/x86/include/asm/hardirq.h | 4 + arch/x86/include/asm/hw_irq.h | 10 + arch/x86/include/asm/hypervisor.h | 4 + arch/x86/include/asm/i8259.h | 2 + arch/x86/include/asm/io.h | 4 +- arch/x86/include/asm/kexec.h | 29 + arch/x86/include/asm/mach_traps.h | 25 + arch/x86/include/asm/mc146818rtc.h | 2 +- arch/x86/include/asm/mmu.h | 5 +- arch/x86/include/asm/nmi.h | 3 + arch/x86/include/asm/page_64_types.h | 8 + arch/x86/include/asm/ptrace.h | 14 +- arch/x86/include/asm/required-features.h | 2 +- arch/x86/include/asm/segment.h | 4 +- arch/x86/include/asm/thread_info.h | 8 + arch/x86/include/asm/topology.h | 2 +- arch/x86/include/asm/trampoline.h | 2 +- arch/x86/include/asm/traps.h | 6 + arch/x86/include/asm/uv/uv_hub.h | 2 +- arch/x86/include/asm/xen/hypercall.h | 1 + arch/x86/include/asm/xen/hypervisor.h | 2 +- arch/x86/include/asm/xen/interface.h | 26 +- arch/x86/include/mach-xen/asm/agp.h | 58 + arch/x86/include/mach-xen/asm/cmpxchg.h | 11 + arch/x86/include/mach-xen/asm/cmpxchg_32.h | 24 + arch/x86/include/mach-xen/asm/cmpxchg_64.h | 11 + arch/x86/include/mach-xen/asm/desc.h | 433 ++ arch/x86/include/mach-xen/asm/dma-mapping.h | 25 + arch/x86/include/mach-xen/asm/fixmap.h | 240 ++ arch/x86/include/mach-xen/asm/gnttab_dma.h | 41 + arch/x86/include/mach-xen/asm/highmem.h | 98 + arch/x86/include/mach-xen/asm/hypercall.h | 439 ++ arch/x86/include/mach-xen/asm/hypercall_32.h | 62 + arch/x86/include/mach-xen/asm/hypercall_64.h | 54 + arch/x86/include/mach-xen/asm/hypervisor.h | 392 ++ arch/x86/include/mach-xen/asm/i387.h | 55 + arch/x86/include/mach-xen/asm/io.h | 343 ++ arch/x86/include/mach-xen/asm/ipi.h | 13 + arch/x86/include/mach-xen/asm/irq_vectors.h | 98 + arch/x86/include/mach-xen/asm/irqflags.h | 212 + arch/x86/include/mach-xen/asm/mach_traps.h | 37 + arch/x86/include/mach-xen/asm/maddr.h | 155 + arch/x86/include/mach-xen/asm/maddr_32.h | 35 + arch/x86/include/mach-xen/asm/maddr_64.h | 21 + arch/x86/include/mach-xen/asm/mmu_context.h | 165 + arch/x86/include/mach-xen/asm/mutex.h | 3 + arch/x86/include/mach-xen/asm/pci.h | 180 + arch/x86/include/mach-xen/asm/percpu.h | 61 + arch/x86/include/mach-xen/asm/perf_event.h | 42 + arch/x86/include/mach-xen/asm/pgalloc.h | 159 + arch/x86/include/mach-xen/asm/pgtable-3level.h | 152 + 
.../include/mach-xen/asm/pgtable-3level_types.h | 44 + arch/x86/include/mach-xen/asm/pgtable.h | 885 +++++ arch/x86/include/mach-xen/asm/pgtable_32.h | 89 + arch/x86/include/mach-xen/asm/pgtable_64.h | 203 + arch/x86/include/mach-xen/asm/pgtable_64_types.h | 64 + arch/x86/include/mach-xen/asm/pgtable_types.h | 392 ++ arch/x86/include/mach-xen/asm/probe_roms.h | 10 + arch/x86/include/mach-xen/asm/processor.h | 978 +++++ arch/x86/include/mach-xen/asm/setup.h | 21 + arch/x86/include/mach-xen/asm/smp-processor-id.h | 36 + arch/x86/include/mach-xen/asm/smp.h | 241 ++ arch/x86/include/mach-xen/asm/spinlock.h | 379 ++ arch/x86/include/mach-xen/asm/spinlock_types.h | 62 + arch/x86/include/mach-xen/asm/swiotlb.h | 8 + arch/x86/include/mach-xen/asm/system.h | 520 +++ arch/x86/include/mach-xen/asm/time.h | 18 + arch/x86/include/mach-xen/asm/tlbflush.h | 114 + arch/x86/include/mach-xen/asm/vga.h | 20 + arch/x86/include/mach-xen/asm/xenoprof.h | 48 + arch/x86/include/mach-xen/asm/xor.h | 8 + arch/x86/include/mach-xen/asm/xor_64.h | 339 ++ arch/x86/kernel/Makefile | 7 + arch/x86/kernel/acpi/Makefile | 4 + arch/x86/kernel/acpi/boot.c | 25 +- arch/x86/kernel/acpi/processor_extcntl_xen.c | 287 ++ arch/x86/kernel/amd_nb.c | 6 + arch/x86/kernel/apic/Makefile | 4 + arch/x86/kernel/apic/apic-xen.c | 69 + arch/x86/kernel/apic/hw_nmi.c | 8 + arch/x86/kernel/apic/io_apic-xen.c | 4199 ++++++++++++++++++++ arch/x86/kernel/apic/ipi-xen.c | 43 + arch/x86/kernel/apic/probe_32-xen.c | 57 + arch/x86/kernel/asm-offsets.c | 4 +- arch/x86/kernel/asm-offsets_32.c | 14 +- arch/x86/kernel/asm-offsets_64.c | 2 + arch/x86/kernel/cpu/Makefile | 3 + arch/x86/kernel/cpu/amd.c | 18 +- arch/x86/kernel/cpu/bugs.c | 6 + arch/x86/kernel/cpu/bugs_64.c | 2 + arch/x86/kernel/cpu/common-xen.c | 1435 +++++++ arch/x86/kernel/cpu/intel.c | 17 + arch/x86/kernel/cpu/intel_cacheinfo.c | 17 +- arch/x86/kernel/cpu/mcheck/Makefile | 1 + arch/x86/kernel/cpu/mcheck/mce-inject.c | 4 +- arch/x86/kernel/cpu/mcheck/mce.c | 28 + arch/x86/kernel/cpu/mcheck/mce_dom0.c | 185 + arch/x86/kernel/cpu/mtrr/Makefile | 1 + arch/x86/kernel/cpu/mtrr/main-xen.c | 326 ++ arch/x86/kernel/cpu/proc.c | 8 +- arch/x86/kernel/cpu/scattered.c | 2 + arch/x86/kernel/cpu/topology.c | 2 +- arch/x86/kernel/dumpstack_64.c | 4 + arch/x86/kernel/e820-xen.c | 1291 ++++++ arch/x86/kernel/early_printk-xen.c | 291 ++ arch/x86/kernel/entry_32-xen.S | 1722 ++++++++ arch/x86/kernel/entry_32.S | 10 +- arch/x86/kernel/entry_64-xen.S | 1385 +++++++ arch/x86/kernel/entry_64.S | 6 +- arch/x86/kernel/fixup.c | 89 + arch/x86/kernel/head-xen.c | 223 ++ arch/x86/kernel/head32-xen.c | 103 + arch/x86/kernel/head64-xen.c | 146 + arch/x86/kernel/head_32-xen.S | 220 + arch/x86/kernel/head_64-xen.S | 176 + arch/x86/kernel/init_task.c | 3 +- arch/x86/kernel/ioport-xen.c | 84 + arch/x86/kernel/irq-xen.c | 350 ++ arch/x86/kernel/irq_work-xen.c | 21 + arch/x86/kernel/ldt-xen.c | 272 ++ arch/x86/kernel/machine_kexec_32.c | 107 +- arch/x86/kernel/machine_kexec_64.c | 178 +- arch/x86/kernel/machine_kexec_xen.c | 29 + arch/x86/kernel/microcode_core-xen.c | 299 ++ arch/x86/kernel/mmconf-fam10h_64.c | 8 + arch/x86/kernel/mpparse-xen.c | 962 +++++ arch/x86/kernel/msr-xen.c | 337 ++ arch/x86/kernel/nmi.c | 17 +- arch/x86/kernel/pci-dma-xen.c | 366 ++ arch/x86/kernel/pci-nommu-xen.c | 114 + arch/x86/kernel/pcspeaker.c | 5 + arch/x86/kernel/probe_roms.c | 7 +- arch/x86/kernel/process-xen.c | 630 +++ arch/x86/kernel/process_32-xen.c | 446 +++ arch/x86/kernel/process_64-xen.c | 695 ++++ arch/x86/kernel/quirks.c | 17 +- 
arch/x86/kernel/relocate_kernel_32.S | 39 +- arch/x86/kernel/relocate_kernel_64.S | 36 +- arch/x86/kernel/resource.c | 8 + arch/x86/kernel/rtc.c | 9 + arch/x86/kernel/setup-xen.c | 1477 +++++++ arch/x86/kernel/setup_percpu.c | 4 + arch/x86/kernel/smp-xen.c | 237 ++ arch/x86/kernel/syscall_32-xen.c | 20 + arch/x86/kernel/time-xen.c | 626 +++ arch/x86/kernel/traps-xen.c | 747 ++++ arch/x86/kernel/vm86_32.c | 12 + arch/x86/kernel/vmlinux.lds.S | 10 +- arch/x86/kernel/vsyscall_64-xen.c | 363 ++ arch/x86/kernel/x8664_ksyms_64.c | 2 +- arch/x86/kernel/x86_init-xen.c | 107 + arch/x86/kvm/Kconfig | 1 + arch/x86/lib/Makefile | 3 + arch/x86/lib/cache-smp-xen.c | 27 + arch/x86/lib/memset_64-xen.S | 154 + arch/x86/lib/scrub.c | 21 + arch/x86/mm/Makefile | 3 + arch/x86/mm/dump_pagetables-xen.c | 392 ++ arch/x86/mm/fault-xen.c | 1241 ++++++ arch/x86/mm/highmem_32-xen.c | 196 + arch/x86/mm/hypervisor.c | 1314 ++++++ arch/x86/mm/init-xen.c | 503 +++ arch/x86/mm/init_32-xen.c | 1019 +++++ arch/x86/mm/init_64-xen.c | 1379 +++++++ arch/x86/mm/iomap_32-xen.c | 121 + arch/x86/mm/ioremap-xen.c | 827 ++++ arch/x86/mm/pageattr-xen.c | 1545 +++++++ arch/x86/mm/pat-xen.c | 840 ++++ arch/x86/mm/pat_internal.h | 4 + arch/x86/mm/pgtable-xen.c | 970 +++++ arch/x86/mm/pgtable_32-xen.c | 179 + arch/x86/mm/physaddr.c | 4 + arch/x86/oprofile/Makefile | 7 + arch/x86/oprofile/xenoprof.c | 179 + arch/x86/pci/Makefile | 3 + arch/x86/pci/amd_bus.c | 15 + arch/x86/pci/i386.c | 2 + arch/x86/pci/irq.c | 9 +- arch/x86/pci/mmconfig-shared.c | 29 + arch/x86/pci/pcifront.c | 59 + arch/x86/platform/efi/Makefile | 1 + arch/x86/platform/efi/efi-xen.c | 507 +++ arch/x86/platform/sfi/sfi.c | 7 + arch/x86/power/Makefile | 2 + arch/x86/vdso/Makefile | 1 + arch/x86/vdso/vclock_gettime.c | 13 +- arch/x86/vdso/vdso32-setup-xen.c | 491 +++ arch/x86/vdso/vdso32.S | 2 +- arch/x86/vdso/vdso32/note.S | 6 +- arch/x86/vdso/vdso32/syscall.S | 2 + arch/x86/xen/Kconfig | 19 +- arch/x86/xen/enlighten.c | 8 +- arch/x86/xen/xen-head.S | 4 +- drivers/Makefile | 3 +- drivers/acpi/Kconfig | 16 +- drivers/acpi/Makefile | 1 + drivers/acpi/acpi_memhotplug.c | 22 + drivers/acpi/acpica/hwsleep.c | 16 + drivers/acpi/osl.c | 6 +- drivers/acpi/pci_irq.c | 77 + drivers/acpi/pci_root.c | 70 + drivers/acpi/processor_core.c | 51 +- drivers/acpi/processor_driver.c | 167 +- drivers/acpi/processor_extcntl.c | 214 + drivers/acpi/processor_idle.c | 42 +- drivers/acpi/processor_perflib.c | 31 +- drivers/acpi/scan.c | 25 + drivers/acpi/sleep.c | 2 + drivers/base/cpu.c | 4 +- drivers/block/Kconfig | 10 +- drivers/block/Makefile | 4 +- drivers/block/floppy.c | 2 + drivers/block/xen-blkback/Makefile | 2 +- drivers/cdrom/Makefile | 1 + drivers/char/Kconfig | 2 +- drivers/char/agp/agp.h | 4 + drivers/char/agp/amd-k7-agp.c | 4 +- drivers/char/agp/amd64-agp.c | 4 +- drivers/char/agp/ati-agp.c | 4 +- drivers/char/agp/efficeon-agp.c | 2 +- drivers/char/agp/generic.c | 8 +- drivers/char/agp/intel-gtt.c | 18 + drivers/char/agp/sworks-agp.c | 6 +- drivers/char/mem.c | 16 + drivers/char/tpm/Kconfig | 9 + drivers/char/tpm/Makefile | 2 + drivers/char/tpm/tpm.h | 15 + drivers/char/tpm/tpm_vtpm.c | 543 +++ drivers/char/tpm/tpm_vtpm.h | 55 + drivers/char/tpm/tpm_xen.c | 720 ++++ drivers/cpufreq/Kconfig | 1 + drivers/cpuidle/Kconfig | 1 + drivers/dma/Kconfig | 2 +- drivers/dma/ioat/Makefile | 3 +- drivers/dma/ioat/dca.c | 12 + drivers/dma/ioat/dma.h | 17 + drivers/dma/ioat/dma_v2.h | 6 + drivers/dma/ioat/hw.h | 4 + drivers/dma/ioat/pci.c | 7 +- drivers/edac/Kconfig | 7 +- drivers/edac/edac_mc.c | 
4 + drivers/edac/i7core_edac.c | 2 +- drivers/edac/sb_edac.c | 4 + drivers/firmware/Kconfig | 1 + drivers/firmware/dcdbas.c | 28 +- drivers/firmware/dell_rbu.c | 45 +- drivers/firmware/dmi_scan.c | 5 + drivers/gpu/drm/i915/i915_drv.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 5 + drivers/gpu/drm/i915/i915_gem.c | 11 + drivers/gpu/drm/i915/intel_display.c | 4 + drivers/gpu/drm/radeon/radeon_device.c | 12 + drivers/gpu/drm/ttm/ttm_bo.c | 8 + drivers/gpu/drm/ttm/ttm_bo_vm.c | 6 + drivers/gpu/drm/ttm/ttm_page_alloc.c | 29 + drivers/gpu/drm/vmwgfx/Kconfig | 2 +- drivers/hv/Kconfig | 2 +- drivers/hwmon/Kconfig | 6 +- drivers/hwmon/coretemp-xen.c | 888 +++++ drivers/hwmon/via-cputemp-xen.c | 397 ++ drivers/ide/ide-lib.c | 11 + drivers/idle/Kconfig | 2 +- drivers/iommu/Kconfig | 1 + drivers/misc/Kconfig | 2 +- drivers/net/Kconfig | 14 +- drivers/net/Makefile | 4 +- drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c | 10 + drivers/net/ethernet/chelsio/cxgb3/sge.c | 62 +- drivers/net/ethernet/chelsio/cxgb3/version.h | 4 + drivers/net/xen-netback/Makefile | 2 +- drivers/oprofile/buffer_sync.c | 81 +- drivers/oprofile/cpu_buffer.c | 73 +- drivers/oprofile/cpu_buffer.h | 12 +- drivers/oprofile/event_buffer.h | 3 + drivers/oprofile/oprof.c | 32 + drivers/oprofile/oprof.h | 3 + drivers/oprofile/oprofile_files.c | 145 + drivers/pci/Kconfig | 49 +- drivers/pci/Makefile | 7 +- drivers/pci/guestdev.c | 881 ++++ drivers/pci/iomulti.c | 904 +++++ drivers/pci/iomulti.h | 122 + drivers/pci/msi-xen.c | 838 ++++ drivers/pci/pci-iomul.c | 440 ++ drivers/pci/pci.c | 12 + drivers/pci/pci.h | 22 + drivers/pci/probe.c | 16 +- drivers/pci/reserve.c | 137 + drivers/pci/setup-bus.c | 8 +- drivers/rtc/Kconfig | 2 +- drivers/scsi/Kconfig | 2 +- drivers/scsi/arcmsr/arcmsr.h | 2 +- drivers/sfi/sfi_core.c | 5 + drivers/staging/zcache/zcache-main.c | 10 +- drivers/tty/hvc/Kconfig | 2 +- drivers/tty/serial/8250/Kconfig | 1 + drivers/tty/tty_io.c | 9 +- drivers/video/Kconfig | 4 +- drivers/virtio/Kconfig | 1 + drivers/watchdog/Kconfig | 2 +- drivers/watchdog/xen_wdt.c | 32 +- drivers/xen/Kconfig | 506 ++- drivers/xen/Makefile | 57 +- drivers/xen/balloon/Makefile | 2 + drivers/xen/balloon/balloon.c | 804 ++++ drivers/xen/balloon/common.h | 57 + drivers/xen/balloon/sysfs.c | 209 + drivers/xen/blkback/Makefile | 4 + drivers/xen/blkback/blkback-pagemap.c | 97 + drivers/xen/blkback/blkback-pagemap.h | 37 + drivers/xen/blkback/blkback.c | 760 ++++ drivers/xen/blkback/cdrom.c | 154 + drivers/xen/blkback/common.h | 161 + drivers/xen/blkback/interface.c | 146 + drivers/xen/blkback/vbd.c | 212 + drivers/xen/blkback/xenbus.c | 662 +++ drivers/xen/blkfront/Makefile | 5 + drivers/xen/blkfront/blkfront.c | 1150 ++++++ drivers/xen/blkfront/block.h | 174 + drivers/xen/blkfront/vbd.c | 554 +++ drivers/xen/blkfront/vcd.c | 496 +++ drivers/xen/blktap/Makefile | 5 + drivers/xen/blktap/blktap.c | 1784 +++++++++ drivers/xen/blktap/blocktap.c | 1 + drivers/xen/blktap/common.h | 112 + drivers/xen/blktap/interface.c | 140 + drivers/xen/blktap/xenbus.c | 517 +++ drivers/xen/blktap2-new/Makefile | 4 + drivers/xen/blktap2-new/blktap.h | 218 + drivers/xen/blktap2-new/control.c | 316 ++ drivers/xen/blktap2-new/device.c | 572 +++ drivers/xen/blktap2-new/request.c | 418 ++ drivers/xen/blktap2-new/ring.c | 547 +++ drivers/xen/blktap2-new/sysfs.c | 299 ++ drivers/xen/blktap2/Makefile | 4 + drivers/xen/blktap2/blktap.h | 264 ++ drivers/xen/blktap2/control.c | 285 ++ drivers/xen/blktap2/device.c | 1176 ++++++ drivers/xen/blktap2/request.c | 296 ++ 
drivers/xen/blktap2/ring.c | 610 +++ drivers/xen/blktap2/sysfs.c | 475 +++ drivers/xen/blktap2/wait_queue.c | 40 + drivers/xen/char/Makefile | 1 + drivers/xen/char/mem.c | 222 ++ drivers/xen/console/Makefile | 2 + drivers/xen/console/console.c | 755 ++++ drivers/xen/console/xencons_ring.c | 141 + drivers/xen/core/Makefile | 16 + drivers/xen/core/acpi_memhotplug.c | 190 + drivers/xen/core/clockevents.c | 293 ++ drivers/xen/core/cpu_hotplug.c | 182 + drivers/xen/core/domctl.c | 562 +++ drivers/xen/core/domctl.h | 4 + drivers/xen/core/evtchn.c | 1992 ++++++++++ drivers/xen/core/firmware.c | 75 + drivers/xen/core/gnttab.c | 890 +++++ drivers/xen/core/machine_kexec.c | 403 ++ drivers/xen/core/machine_reboot.c | 285 ++ drivers/xen/core/pcpu.c | 447 +++ drivers/xen/core/reboot.c | 348 ++ drivers/xen/core/smpboot.c | 398 ++ drivers/xen/core/spinlock.c | 408 ++ drivers/xen/core/xen_proc.c | 30 + drivers/xen/evtchn.c | 22 +- drivers/xen/fbfront/Makefile | 2 + drivers/xen/fbfront/xenfb.c | 910 +++++ drivers/xen/fbfront/xenkbd.c | 366 ++ drivers/xen/features.c | 9 +- drivers/xen/gntdev/Makefile | 1 + drivers/xen/gntdev/gntdev.c | 1012 +++++ drivers/xen/netback/Makefile | 5 + drivers/xen/netback/accel.c | 269 ++ drivers/xen/netback/common.h | 297 ++ drivers/xen/netback/interface.c | 363 ++ drivers/xen/netback/loopback.c | 278 ++ drivers/xen/netback/netback.c | 1885 +++++++++ drivers/xen/netback/xenbus.c | 494 +++ drivers/xen/netfront/Makefile | 4 + drivers/xen/netfront/accel.c | 830 ++++ drivers/xen/netfront/netfront.c | 2274 +++++++++++ drivers/xen/netfront/netfront.h | 288 ++ drivers/xen/pci.c | 11 + drivers/xen/pcifront/Makefile | 5 + drivers/xen/pcifront/pci.c | 44 + drivers/xen/pcifront/pci_op.c | 671 ++++ drivers/xen/pcifront/pcifront.h | 57 + drivers/xen/pcifront/xenbus.c | 476 +++ drivers/xen/privcmd/Makefile | 3 + drivers/xen/privcmd/compat_privcmd.c | 140 + drivers/xen/privcmd/privcmd.c | 470 +++ drivers/xen/scsiback/Makefile | 4 + drivers/xen/scsiback/common.h | 173 + drivers/xen/scsiback/emulate.c | 480 +++ drivers/xen/scsiback/interface.c | 141 + drivers/xen/scsiback/scsiback.c | 730 ++++ drivers/xen/scsiback/translate.c | 168 + drivers/xen/scsiback/xenbus.c | 375 ++ drivers/xen/scsifront/Makefile | 3 + drivers/xen/scsifront/common.h | 135 + drivers/xen/scsifront/scsifront.c | 478 +++ drivers/xen/scsifront/xenbus.c | 424 ++ drivers/xen/sfc_netback/Makefile | 12 + drivers/xen/sfc_netback/accel.c | 147 + drivers/xen/sfc_netback/accel.h | 392 ++ drivers/xen/sfc_netback/accel_debugfs.c | 148 + drivers/xen/sfc_netback/accel_fwd.c | 420 ++ drivers/xen/sfc_netback/accel_msg.c | 391 ++ drivers/xen/sfc_netback/accel_solarflare.c | 1292 ++++++ drivers/xen/sfc_netback/accel_solarflare.h | 88 + drivers/xen/sfc_netback/accel_xenbus.c | 831 ++++ drivers/xen/sfc_netback/ci/compat.h | 53 + drivers/xen/sfc_netback/ci/compat/gcc.h | 158 + drivers/xen/sfc_netback/ci/compat/gcc_x86.h | 115 + drivers/xen/sfc_netback/ci/compat/primitive.h | 77 + drivers/xen/sfc_netback/ci/compat/sysdep.h | 166 + drivers/xen/sfc_netback/ci/compat/utils.h | 269 ++ drivers/xen/sfc_netback/ci/compat/x86.h | 48 + drivers/xen/sfc_netback/ci/compat/x86_64.h | 54 + drivers/xen/sfc_netback/ci/tools/config.h | 49 + drivers/xen/sfc_netback/ci/tools/debug.h | 336 ++ drivers/xen/sfc_netback/ci/tools/log.h | 269 ++ .../xen/sfc_netback/ci/tools/platform/gcc_x86.h | 370 ++ .../sfc_netback/ci/tools/platform/linux_kernel.h | 361 ++ drivers/xen/sfc_netback/ci/tools/sysdep.h | 132 + drivers/xen/sfc_netfront/Makefile | 11 + 
drivers/xen/sfc_netfront/accel.h | 495 +++ drivers/xen/sfc_netfront/accel_bufs.c | 393 ++ drivers/xen/sfc_netfront/accel_bufs.h | 181 + drivers/xen/sfc_netfront/accel_debugfs.c | 227 ++ drivers/xen/sfc_netfront/accel_msg.c | 567 +++ drivers/xen/sfc_netfront/accel_netfront.c | 330 ++ drivers/xen/sfc_netfront/accel_ssr.c | 308 ++ drivers/xen/sfc_netfront/accel_ssr.h | 88 + drivers/xen/sfc_netfront/accel_tso.c | 509 +++ drivers/xen/sfc_netfront/accel_tso.h | 57 + drivers/xen/sfc_netfront/accel_vi.c | 1203 ++++++ drivers/xen/sfc_netfront/accel_xenbus.c | 775 ++++ drivers/xen/sfc_netfront/ef_vi_falcon.h | 172 + drivers/xen/sfc_netfront/ef_vi_falcon_core.h | 1075 +++++ drivers/xen/sfc_netfront/ef_vi_falcon_desc.h | 43 + drivers/xen/sfc_netfront/ef_vi_falcon_event.h | 123 + drivers/xen/sfc_netfront/ef_vi_internal.h | 256 ++ drivers/xen/sfc_netfront/etherfabric/ef_vi.h | 647 +++ drivers/xen/sfc_netfront/falcon_event.c | 346 ++ drivers/xen/sfc_netfront/falcon_vi.c | 473 +++ drivers/xen/sfc_netfront/pt_tx.c | 91 + drivers/xen/sfc_netfront/sysdep.h | 185 + drivers/xen/sfc_netfront/vi_init.c | 183 + drivers/xen/sfc_netutil/Makefile | 11 + drivers/xen/sfc_netutil/accel_cuckoo_hash.c | 649 +++ drivers/xen/sfc_netutil/accel_cuckoo_hash.h | 227 ++ drivers/xen/sfc_netutil/accel_msg_iface.c | 301 ++ drivers/xen/sfc_netutil/accel_msg_iface.h | 415 ++ drivers/xen/sfc_netutil/accel_shared_fifo.h | 127 + drivers/xen/sfc_netutil/accel_util.c | 336 ++ drivers/xen/sfc_netutil/accel_util.h | 124 + drivers/xen/sys-hypervisor.c | 51 +- drivers/xen/tmem.c | 45 +- drivers/xen/tpmback/Makefile | 4 + drivers/xen/tpmback/common.h | 93 + drivers/xen/tpmback/interface.c | 133 + drivers/xen/tpmback/tpmback.c | 947 +++++ drivers/xen/tpmback/xenbus.c | 268 ++ drivers/xen/usbback/Makefile | 4 + drivers/xen/usbback/interface.c | 190 + drivers/xen/usbback/usbback.c | 1198 ++++++ drivers/xen/usbback/usbback.h | 170 + drivers/xen/usbback/usbstub.c | 324 ++ drivers/xen/usbback/xenbus.c | 334 ++ drivers/xen/usbfront/Makefile | 11 + drivers/xen/usbfront/usbfront-dbg.c | 101 + drivers/xen/usbfront/usbfront-hcd.c | 232 ++ drivers/xen/usbfront/usbfront-hub.c | 471 +++ drivers/xen/usbfront/usbfront-q.c | 542 +++ drivers/xen/usbfront/usbfront.h | 197 + drivers/xen/usbfront/xenbus.c | 414 ++ drivers/xen/util.c | 74 + drivers/xen/xen-pciback/Makefile | 16 +- drivers/xen/xen-pciback/conf_space_capability.c | 15 + drivers/xen/xen-pciback/conf_space_header.c | 9 +- drivers/xen/xen-pciback/controller.c | 450 +++ drivers/xen/xen-pciback/pci_stub.c | 53 + drivers/xen/xen-pciback/pciback.h | 21 +- drivers/xen/xen-pciback/pciback_ops.c | 52 + drivers/xen/xen-pciback/slot.c | 200 + drivers/xen/xen-pciback/xenbus.c | 122 +- drivers/xen/xen-selfballoon.c | 5 +- drivers/xen/xenbus/Makefile | 23 +- drivers/xen/xenbus/xenbus_backend_client.c | 106 + drivers/xen/xenbus/xenbus_client.c | 113 +- drivers/xen/xenbus/xenbus_comms.c | 71 +- drivers/xen/xenbus/xenbus_comms.h | 23 + drivers/xen/xenbus/xenbus_dev.c | 511 +++ drivers/xen/xenbus/xenbus_dev_backend.c | 4 +- drivers/xen/xenbus/xenbus_probe.c | 951 ++++- drivers/xen/xenbus/xenbus_probe.h | 30 + drivers/xen/xenbus/xenbus_probe_backend.c | 82 + drivers/xen/xenbus/xenbus_xs.c | 146 +- drivers/xen/xenoprof/xenoprofile.c | 585 +++ fs/Kconfig | 1 + fs/aio.c | 123 +- fs/block_dev.c | 2 +- fs/compat_ioctl.c | 23 + fs/proc/kcore.c | 6 +- fs/super.c | 2 +- include/acpi/processor.h | 148 + include/linux/acpi.h | 2 + include/linux/aio.h | 6 + include/linux/cleancache.h | 24 +- include/linux/console.h | 1 + 
include/linux/cpufreq.h | 2 +- include/linux/efi.h | 4 + include/linux/elfnote.h | 2 +- include/linux/frontswap.h | 126 + include/linux/highmem.h | 6 + include/linux/interrupt.h | 5 + include/linux/kexec.h | 20 + include/linux/mm.h | 22 + include/linux/nmi.h | 3 + include/linux/oprofile.h | 20 +- include/linux/page-flags.h | 46 +- include/linux/pci.h | 15 +- include/linux/swap.h | 4 + include/linux/swapfile.h | 13 + include/linux/sysctl.h | 1 + include/linux/vermagic.h | 7 +- include/xen/Kbuild | 3 +- include/xen/balloon.h | 67 +- include/xen/blkif.h | 159 + include/xen/clock.h | 18 + include/xen/compat_ioctl.h | 75 + include/xen/cpu_hotplug.h | 39 + include/xen/driver_util.h | 14 + include/xen/evtchn.h | 220 +- include/xen/features.h | 3 +- include/xen/firmware.h | 14 + include/xen/gntdev.h | 153 +- include/xen/gnttab.h | 207 + include/xen/hvm.h | 5 +- include/xen/hypercall.h | 30 + include/xen/interface/COPYING | 38 + include/xen/interface/arch-x86/cpuid.h | 68 + include/xen/interface/arch-x86/hvm/save.h | 583 +++ include/xen/interface/arch-x86/xen-mca.h | 440 ++ include/xen/interface/arch-x86/xen-x86_32.h | 171 + include/xen/interface/arch-x86/xen-x86_64.h | 202 + include/xen/interface/arch-x86/xen.h | 210 + include/xen/interface/arch-x86_32.h | 27 + include/xen/interface/arch-x86_64.h | 27 + include/xen/interface/callback.h | 9 + include/xen/interface/dom0_ops.h | 120 + include/xen/interface/domctl.h | 1013 +++++ include/xen/interface/elfnote.h | 118 +- include/xen/interface/event_channel.h | 74 +- include/xen/interface/features.h | 32 +- include/xen/interface/grant_table.h | 110 +- include/xen/interface/hvm/e820.h | 34 + include/xen/interface/hvm/hvm_info_table.h | 72 + include/xen/interface/hvm/hvm_op.h | 227 +- include/xen/interface/hvm/ioreq.h | 140 + include/xen/interface/hvm/params.h | 56 +- include/xen/interface/hvm/save.h | 111 + include/xen/interface/io/blkif.h | 138 +- include/xen/interface/io/cdromif.h | 120 + include/xen/interface/io/console.h | 18 + include/xen/interface/io/fbif.h | 26 +- include/xen/interface/io/fsif.h | 192 + include/xen/interface/io/libxenvchan.h | 97 + include/xen/interface/io/netif.h | 71 +- include/xen/interface/io/protocols.h | 22 + include/xen/interface/io/ring.h | 94 +- include/xen/interface/io/tpmif.h | 77 + include/xen/interface/io/usbif.h | 151 + include/xen/interface/io/vscsiif.h | 105 + include/xen/interface/io/xenbus.h | 1 + include/xen/interface/io/xs_wire.h | 35 +- include/xen/interface/kexec.h | 168 + include/xen/interface/mem_event.h | 90 + include/xen/interface/memory.h | 160 +- include/xen/interface/nmi.h | 80 + include/xen/interface/physdev.h | 93 +- include/xen/interface/platform.h | 268 +- include/xen/interface/sched.h | 32 +- include/xen/interface/sysctl.h | 640 +++ include/xen/interface/tmem.h | 148 + include/xen/interface/trace.h | 245 ++ include/xen/interface/vcpu.h | 62 +- include/xen/interface/version.h | 44 +- include/xen/interface/xen-compat.h | 44 + include/xen/interface/xen.h | 537 ++- include/xen/interface/xenoprof.h | 152 + include/xen/interface/xsm/acm.h | 223 ++ include/xen/interface/xsm/acm_ops.h | 159 + include/xen/interface/xsm/flask_op.h | 184 + include/xen/net-util.h | 75 + include/xen/pcifront.h | 69 + include/xen/pcpu.h | 19 + include/xen/privcmd.h | 80 +- include/xen/public/Kbuild | 5 + include/xen/public/evtchn.h | 88 + include/xen/public/gntdev.h | 150 + include/xen/public/iomulti.h | 50 + include/xen/public/privcmd.h | 86 + include/xen/public/xenbus.h | 52 + include/xen/sysctl.h | 11 + include/xen/xen.h | 6 
+- include/xen/xen_proc.h | 12 + include/xen/xenbus.h | 144 +- include/xen/xencons.h | 17 + include/xen/xenoprof.h | 42 + kernel/Kconfig.preempt | 1 + kernel/irq/spurious.c | 2 +- kernel/kexec.c | 96 +- kernel/ksysfs.c | 4 + kernel/power/Kconfig | 4 +- kernel/sched/core.c | 62 +- kernel/sysctl.c | 2 +- kernel/sysctl_binary.c | 12 + kernel/time/timekeeping.c | 6 + lib/swiotlb-xen.c | 808 ++++ mm/Kconfig | 19 +- mm/Makefile | 1 + mm/cleancache.c | 98 +- mm/filemap.c | 2 +- mm/frontswap.c | 272 ++ mm/init-mm.c | 4 + mm/memory.c | 46 +- mm/mmap.c | 14 + mm/page_alloc.c | 29 + mm/page_io.c | 12 + mm/swapfile.c | 64 +- mm/tmem-xen.c | 56 + mm/truncate.c | 10 +- mm/vmalloc.c | 30 + net/ipv6/addrconf.c | 2 + scripts/Makefile.build | 15 + scripts/Makefile.lib | 6 + scripts/Makefile.xen.awk | 34 + 665 files changed, 124049 insertions(+), 1515 deletions(-) delete mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-cleancache create mode 100644 Documentation/vm/frontswap.txt create mode 100644 arch/x86/ia32/ia32entry-xen.S create mode 100644 arch/x86/include/mach-xen/asm/agp.h create mode 100644 arch/x86/include/mach-xen/asm/cmpxchg.h create mode 100644 arch/x86/include/mach-xen/asm/cmpxchg_32.h create mode 100644 arch/x86/include/mach-xen/asm/cmpxchg_64.h create mode 100644 arch/x86/include/mach-xen/asm/desc.h create mode 100644 arch/x86/include/mach-xen/asm/dma-mapping.h create mode 100644 arch/x86/include/mach-xen/asm/fixmap.h create mode 100644 arch/x86/include/mach-xen/asm/gnttab_dma.h create mode 100644 arch/x86/include/mach-xen/asm/highmem.h create mode 100644 arch/x86/include/mach-xen/asm/hypercall.h create mode 100644 arch/x86/include/mach-xen/asm/hypercall_32.h create mode 100644 arch/x86/include/mach-xen/asm/hypercall_64.h create mode 100644 arch/x86/include/mach-xen/asm/hypervisor.h create mode 100644 arch/x86/include/mach-xen/asm/i387.h create mode 100644 arch/x86/include/mach-xen/asm/io.h create mode 100644 arch/x86/include/mach-xen/asm/ipi.h create mode 100644 arch/x86/include/mach-xen/asm/irq_vectors.h create mode 100644 arch/x86/include/mach-xen/asm/irqflags.h create mode 100644 arch/x86/include/mach-xen/asm/mach_traps.h create mode 100644 arch/x86/include/mach-xen/asm/maddr.h create mode 100644 arch/x86/include/mach-xen/asm/maddr_32.h create mode 100644 arch/x86/include/mach-xen/asm/maddr_64.h create mode 100644 arch/x86/include/mach-xen/asm/mmu_context.h create mode 100644 arch/x86/include/mach-xen/asm/mutex.h create mode 100644 arch/x86/include/mach-xen/asm/pci.h create mode 100644 arch/x86/include/mach-xen/asm/percpu.h create mode 100644 arch/x86/include/mach-xen/asm/perf_event.h create mode 100644 arch/x86/include/mach-xen/asm/pgalloc.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable-3level.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable-3level_types.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable_32.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable_64.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable_64_types.h create mode 100644 arch/x86/include/mach-xen/asm/pgtable_types.h create mode 100644 arch/x86/include/mach-xen/asm/probe_roms.h create mode 100644 arch/x86/include/mach-xen/asm/processor.h create mode 100644 arch/x86/include/mach-xen/asm/setup.h create mode 100644 arch/x86/include/mach-xen/asm/smp-processor-id.h create mode 100644 arch/x86/include/mach-xen/asm/smp.h create mode 100644 arch/x86/include/mach-xen/asm/spinlock.h create mode 100644 
arch/x86/include/mach-xen/asm/spinlock_types.h create mode 100644 arch/x86/include/mach-xen/asm/swiotlb.h create mode 100644 arch/x86/include/mach-xen/asm/system.h create mode 100644 arch/x86/include/mach-xen/asm/time.h create mode 100644 arch/x86/include/mach-xen/asm/tlbflush.h create mode 100644 arch/x86/include/mach-xen/asm/vga.h create mode 100644 arch/x86/include/mach-xen/asm/xenoprof.h create mode 100644 arch/x86/include/mach-xen/asm/xor.h create mode 100644 arch/x86/include/mach-xen/asm/xor_64.h create mode 100644 arch/x86/kernel/acpi/processor_extcntl_xen.c create mode 100644 arch/x86/kernel/apic/apic-xen.c create mode 100644 arch/x86/kernel/apic/io_apic-xen.c create mode 100644 arch/x86/kernel/apic/ipi-xen.c create mode 100644 arch/x86/kernel/apic/probe_32-xen.c create mode 100644 arch/x86/kernel/cpu/common-xen.c create mode 100644 arch/x86/kernel/cpu/mcheck/mce_dom0.c create mode 100644 arch/x86/kernel/cpu/mtrr/main-xen.c create mode 100644 arch/x86/kernel/e820-xen.c create mode 100644 arch/x86/kernel/early_printk-xen.c create mode 100644 arch/x86/kernel/entry_32-xen.S create mode 100644 arch/x86/kernel/entry_64-xen.S create mode 100644 arch/x86/kernel/fixup.c create mode 100644 arch/x86/kernel/head-xen.c create mode 100644 arch/x86/kernel/head32-xen.c create mode 100644 arch/x86/kernel/head64-xen.c create mode 100644 arch/x86/kernel/head_32-xen.S create mode 100644 arch/x86/kernel/head_64-xen.S create mode 100644 arch/x86/kernel/ioport-xen.c create mode 100644 arch/x86/kernel/irq-xen.c create mode 100644 arch/x86/kernel/irq_work-xen.c create mode 100644 arch/x86/kernel/ldt-xen.c create mode 100644 arch/x86/kernel/machine_kexec_xen.c create mode 100644 arch/x86/kernel/microcode_core-xen.c create mode 100644 arch/x86/kernel/mpparse-xen.c create mode 100644 arch/x86/kernel/msr-xen.c create mode 100644 arch/x86/kernel/pci-dma-xen.c create mode 100644 arch/x86/kernel/pci-nommu-xen.c create mode 100644 arch/x86/kernel/process-xen.c create mode 100644 arch/x86/kernel/process_32-xen.c create mode 100644 arch/x86/kernel/process_64-xen.c create mode 100644 arch/x86/kernel/setup-xen.c create mode 100644 arch/x86/kernel/smp-xen.c create mode 100644 arch/x86/kernel/syscall_32-xen.c create mode 100644 arch/x86/kernel/time-xen.c create mode 100644 arch/x86/kernel/traps-xen.c create mode 100644 arch/x86/kernel/vsyscall_64-xen.c create mode 100644 arch/x86/kernel/x86_init-xen.c create mode 100644 arch/x86/lib/cache-smp-xen.c create mode 100644 arch/x86/lib/memset_64-xen.S create mode 100644 arch/x86/lib/scrub.c create mode 100644 arch/x86/mm/dump_pagetables-xen.c create mode 100644 arch/x86/mm/fault-xen.c create mode 100644 arch/x86/mm/highmem_32-xen.c create mode 100644 arch/x86/mm/hypervisor.c create mode 100644 arch/x86/mm/init-xen.c create mode 100644 arch/x86/mm/init_32-xen.c create mode 100644 arch/x86/mm/init_64-xen.c create mode 100644 arch/x86/mm/iomap_32-xen.c create mode 100644 arch/x86/mm/ioremap-xen.c create mode 100644 arch/x86/mm/pageattr-xen.c create mode 100644 arch/x86/mm/pat-xen.c create mode 100644 arch/x86/mm/pgtable-xen.c create mode 100644 arch/x86/mm/pgtable_32-xen.c create mode 100644 arch/x86/oprofile/xenoprof.c create mode 100644 arch/x86/pci/pcifront.c create mode 100644 arch/x86/platform/efi/efi-xen.c create mode 100644 arch/x86/vdso/vdso32-setup-xen.c create mode 100644 drivers/acpi/processor_extcntl.c create mode 100644 drivers/char/tpm/tpm_vtpm.c create mode 100644 drivers/char/tpm/tpm_vtpm.h create mode 100644 drivers/char/tpm/tpm_xen.c create mode 100644 
drivers/hwmon/coretemp-xen.c create mode 100644 drivers/hwmon/via-cputemp-xen.c create mode 100644 drivers/pci/guestdev.c create mode 100644 drivers/pci/iomulti.c create mode 100644 drivers/pci/iomulti.h create mode 100644 drivers/pci/msi-xen.c create mode 100644 drivers/pci/pci-iomul.c create mode 100644 drivers/pci/reserve.c create mode 100644 drivers/xen/balloon/Makefile create mode 100644 drivers/xen/balloon/balloon.c create mode 100644 drivers/xen/balloon/common.h create mode 100644 drivers/xen/balloon/sysfs.c create mode 100644 drivers/xen/blkback/Makefile create mode 100644 drivers/xen/blkback/blkback-pagemap.c create mode 100644 drivers/xen/blkback/blkback-pagemap.h create mode 100644 drivers/xen/blkback/blkback.c create mode 100644 drivers/xen/blkback/cdrom.c create mode 100644 drivers/xen/blkback/common.h create mode 100644 drivers/xen/blkback/interface.c create mode 100644 drivers/xen/blkback/vbd.c create mode 100644 drivers/xen/blkback/xenbus.c create mode 100644 drivers/xen/blkfront/Makefile create mode 100644 drivers/xen/blkfront/blkfront.c create mode 100644 drivers/xen/blkfront/block.h create mode 100644 drivers/xen/blkfront/vbd.c create mode 100644 drivers/xen/blkfront/vcd.c create mode 100644 drivers/xen/blktap/Makefile create mode 100644 drivers/xen/blktap/blktap.c create mode 100644 drivers/xen/blktap/blocktap.c create mode 100644 drivers/xen/blktap/common.h create mode 100644 drivers/xen/blktap/interface.c create mode 100644 drivers/xen/blktap/xenbus.c create mode 100644 drivers/xen/blktap2-new/Makefile create mode 100644 drivers/xen/blktap2-new/blktap.h create mode 100644 drivers/xen/blktap2-new/control.c create mode 100644 drivers/xen/blktap2-new/device.c create mode 100644 drivers/xen/blktap2-new/request.c create mode 100644 drivers/xen/blktap2-new/ring.c create mode 100644 drivers/xen/blktap2-new/sysfs.c create mode 100644 drivers/xen/blktap2/Makefile create mode 100644 drivers/xen/blktap2/blktap.h create mode 100644 drivers/xen/blktap2/control.c create mode 100644 drivers/xen/blktap2/device.c create mode 100644 drivers/xen/blktap2/request.c create mode 100644 drivers/xen/blktap2/ring.c create mode 100644 drivers/xen/blktap2/sysfs.c create mode 100644 drivers/xen/blktap2/wait_queue.c create mode 100644 drivers/xen/char/Makefile create mode 100644 drivers/xen/char/mem.c create mode 100644 drivers/xen/console/Makefile create mode 100644 drivers/xen/console/console.c create mode 100644 drivers/xen/console/xencons_ring.c create mode 100644 drivers/xen/core/Makefile create mode 100644 drivers/xen/core/acpi_memhotplug.c create mode 100644 drivers/xen/core/clockevents.c create mode 100644 drivers/xen/core/cpu_hotplug.c create mode 100644 drivers/xen/core/domctl.c create mode 100644 drivers/xen/core/domctl.h create mode 100644 drivers/xen/core/evtchn.c create mode 100644 drivers/xen/core/firmware.c create mode 100644 drivers/xen/core/gnttab.c create mode 100644 drivers/xen/core/machine_kexec.c create mode 100644 drivers/xen/core/machine_reboot.c create mode 100644 drivers/xen/core/pcpu.c create mode 100644 drivers/xen/core/reboot.c create mode 100644 drivers/xen/core/smpboot.c create mode 100644 drivers/xen/core/spinlock.c create mode 100644 drivers/xen/core/xen_proc.c create mode 100644 drivers/xen/fbfront/Makefile create mode 100644 drivers/xen/fbfront/xenfb.c create mode 100644 drivers/xen/fbfront/xenkbd.c create mode 100644 drivers/xen/gntdev/Makefile create mode 100644 drivers/xen/gntdev/gntdev.c create mode 100644 drivers/xen/netback/Makefile create mode 100644 
drivers/xen/netback/accel.c create mode 100644 drivers/xen/netback/common.h create mode 100644 drivers/xen/netback/interface.c create mode 100644 drivers/xen/netback/loopback.c create mode 100644 drivers/xen/netback/netback.c create mode 100644 drivers/xen/netback/xenbus.c create mode 100644 drivers/xen/netfront/Makefile create mode 100644 drivers/xen/netfront/accel.c create mode 100644 drivers/xen/netfront/netfront.c create mode 100644 drivers/xen/netfront/netfront.h create mode 100644 drivers/xen/pcifront/Makefile create mode 100644 drivers/xen/pcifront/pci.c create mode 100644 drivers/xen/pcifront/pci_op.c create mode 100644 drivers/xen/pcifront/pcifront.h create mode 100644 drivers/xen/pcifront/xenbus.c create mode 100644 drivers/xen/privcmd/Makefile create mode 100644 drivers/xen/privcmd/compat_privcmd.c create mode 100644 drivers/xen/privcmd/privcmd.c create mode 100644 drivers/xen/scsiback/Makefile create mode 100644 drivers/xen/scsiback/common.h create mode 100644 drivers/xen/scsiback/emulate.c create mode 100644 drivers/xen/scsiback/interface.c create mode 100644 drivers/xen/scsiback/scsiback.c create mode 100644 drivers/xen/scsiback/translate.c create mode 100644 drivers/xen/scsiback/xenbus.c create mode 100644 drivers/xen/scsifront/Makefile create mode 100644 drivers/xen/scsifront/common.h create mode 100644 drivers/xen/scsifront/scsifront.c create mode 100644 drivers/xen/scsifront/xenbus.c create mode 100644 drivers/xen/sfc_netback/Makefile create mode 100644 drivers/xen/sfc_netback/accel.c create mode 100644 drivers/xen/sfc_netback/accel.h create mode 100644 drivers/xen/sfc_netback/accel_debugfs.c create mode 100644 drivers/xen/sfc_netback/accel_fwd.c create mode 100644 drivers/xen/sfc_netback/accel_msg.c create mode 100644 drivers/xen/sfc_netback/accel_solarflare.c create mode 100644 drivers/xen/sfc_netback/accel_solarflare.h create mode 100644 drivers/xen/sfc_netback/accel_xenbus.c create mode 100644 drivers/xen/sfc_netback/ci/compat.h create mode 100644 drivers/xen/sfc_netback/ci/compat/gcc.h create mode 100644 drivers/xen/sfc_netback/ci/compat/gcc_x86.h create mode 100644 drivers/xen/sfc_netback/ci/compat/primitive.h create mode 100644 drivers/xen/sfc_netback/ci/compat/sysdep.h create mode 100644 drivers/xen/sfc_netback/ci/compat/utils.h create mode 100644 drivers/xen/sfc_netback/ci/compat/x86.h create mode 100644 drivers/xen/sfc_netback/ci/compat/x86_64.h create mode 100644 drivers/xen/sfc_netback/ci/tools/config.h create mode 100644 drivers/xen/sfc_netback/ci/tools/debug.h create mode 100644 drivers/xen/sfc_netback/ci/tools/log.h create mode 100644 drivers/xen/sfc_netback/ci/tools/platform/gcc_x86.h create mode 100644 drivers/xen/sfc_netback/ci/tools/platform/linux_kernel.h create mode 100644 drivers/xen/sfc_netback/ci/tools/sysdep.h create mode 100644 drivers/xen/sfc_netfront/Makefile create mode 100644 drivers/xen/sfc_netfront/accel.h create mode 100644 drivers/xen/sfc_netfront/accel_bufs.c create mode 100644 drivers/xen/sfc_netfront/accel_bufs.h create mode 100644 drivers/xen/sfc_netfront/accel_debugfs.c create mode 100644 drivers/xen/sfc_netfront/accel_msg.c create mode 100644 drivers/xen/sfc_netfront/accel_netfront.c create mode 100644 drivers/xen/sfc_netfront/accel_ssr.c create mode 100644 drivers/xen/sfc_netfront/accel_ssr.h create mode 100644 drivers/xen/sfc_netfront/accel_tso.c create mode 100644 drivers/xen/sfc_netfront/accel_tso.h create mode 100644 drivers/xen/sfc_netfront/accel_vi.c create mode 100644 drivers/xen/sfc_netfront/accel_xenbus.c create mode 
100644 drivers/xen/sfc_netfront/ef_vi_falcon.h create mode 100644 drivers/xen/sfc_netfront/ef_vi_falcon_core.h create mode 100644 drivers/xen/sfc_netfront/ef_vi_falcon_desc.h create mode 100644 drivers/xen/sfc_netfront/ef_vi_falcon_event.h create mode 100644 drivers/xen/sfc_netfront/ef_vi_internal.h create mode 100644 drivers/xen/sfc_netfront/etherfabric/ef_vi.h create mode 100644 drivers/xen/sfc_netfront/falcon_event.c create mode 100644 drivers/xen/sfc_netfront/falcon_vi.c create mode 100644 drivers/xen/sfc_netfront/pt_tx.c create mode 100644 drivers/xen/sfc_netfront/sysdep.h create mode 100644 drivers/xen/sfc_netfront/vi_init.c create mode 100644 drivers/xen/sfc_netutil/Makefile create mode 100644 drivers/xen/sfc_netutil/accel_cuckoo_hash.c create mode 100644 drivers/xen/sfc_netutil/accel_cuckoo_hash.h create mode 100644 drivers/xen/sfc_netutil/accel_msg_iface.c create mode 100644 drivers/xen/sfc_netutil/accel_msg_iface.h create mode 100644 drivers/xen/sfc_netutil/accel_shared_fifo.h create mode 100644 drivers/xen/sfc_netutil/accel_util.c create mode 100644 drivers/xen/sfc_netutil/accel_util.h create mode 100644 drivers/xen/tpmback/Makefile create mode 100644 drivers/xen/tpmback/common.h create mode 100644 drivers/xen/tpmback/interface.c create mode 100644 drivers/xen/tpmback/tpmback.c create mode 100644 drivers/xen/tpmback/xenbus.c create mode 100644 drivers/xen/usbback/Makefile create mode 100644 drivers/xen/usbback/interface.c create mode 100644 drivers/xen/usbback/usbback.c create mode 100644 drivers/xen/usbback/usbback.h create mode 100644 drivers/xen/usbback/usbstub.c create mode 100644 drivers/xen/usbback/xenbus.c create mode 100644 drivers/xen/usbfront/Makefile create mode 100644 drivers/xen/usbfront/usbfront-dbg.c create mode 100644 drivers/xen/usbfront/usbfront-hcd.c create mode 100644 drivers/xen/usbfront/usbfront-hub.c create mode 100644 drivers/xen/usbfront/usbfront-q.c create mode 100644 drivers/xen/usbfront/usbfront.h create mode 100644 drivers/xen/usbfront/xenbus.c create mode 100644 drivers/xen/util.c create mode 100644 drivers/xen/xen-pciback/controller.c create mode 100644 drivers/xen/xen-pciback/slot.c create mode 100644 drivers/xen/xenbus/xenbus_backend_client.c create mode 100644 drivers/xen/xenbus/xenbus_dev.c create mode 100644 drivers/xen/xenoprof/xenoprofile.c create mode 100644 include/linux/frontswap.h create mode 100644 include/linux/swapfile.h create mode 100644 include/xen/blkif.h create mode 100644 include/xen/clock.h create mode 100644 include/xen/compat_ioctl.h create mode 100644 include/xen/cpu_hotplug.h create mode 100644 include/xen/driver_util.h create mode 100644 include/xen/firmware.h create mode 100644 include/xen/gnttab.h create mode 100644 include/xen/hypercall.h create mode 100644 include/xen/interface/COPYING create mode 100644 include/xen/interface/arch-x86/cpuid.h create mode 100644 include/xen/interface/arch-x86/hvm/save.h create mode 100644 include/xen/interface/arch-x86/xen-mca.h create mode 100644 include/xen/interface/arch-x86/xen-x86_32.h create mode 100644 include/xen/interface/arch-x86/xen-x86_64.h create mode 100644 include/xen/interface/arch-x86/xen.h create mode 100644 include/xen/interface/arch-x86_32.h create mode 100644 include/xen/interface/arch-x86_64.h create mode 100644 include/xen/interface/dom0_ops.h create mode 100644 include/xen/interface/domctl.h create mode 100644 include/xen/interface/hvm/e820.h create mode 100644 include/xen/interface/hvm/hvm_info_table.h create mode 100644 include/xen/interface/hvm/ioreq.h create 
mode 100644 include/xen/interface/hvm/save.h create mode 100644 include/xen/interface/io/cdromif.h create mode 100644 include/xen/interface/io/fsif.h create mode 100644 include/xen/interface/io/libxenvchan.h create mode 100644 include/xen/interface/io/tpmif.h create mode 100644 include/xen/interface/io/usbif.h create mode 100644 include/xen/interface/io/vscsiif.h create mode 100644 include/xen/interface/kexec.h create mode 100644 include/xen/interface/mem_event.h create mode 100644 include/xen/interface/nmi.h create mode 100644 include/xen/interface/sysctl.h create mode 100644 include/xen/interface/tmem.h create mode 100644 include/xen/interface/trace.h create mode 100644 include/xen/interface/xen-compat.h create mode 100644 include/xen/interface/xenoprof.h create mode 100644 include/xen/interface/xsm/acm.h create mode 100644 include/xen/interface/xsm/acm_ops.h create mode 100644 include/xen/interface/xsm/flask_op.h create mode 100644 include/xen/net-util.h create mode 100644 include/xen/pcifront.h create mode 100644 include/xen/pcpu.h create mode 100644 include/xen/public/Kbuild create mode 100644 include/xen/public/evtchn.h create mode 100644 include/xen/public/gntdev.h create mode 100644 include/xen/public/iomulti.h create mode 100644 include/xen/public/privcmd.h create mode 100644 include/xen/public/xenbus.h create mode 100644 include/xen/sysctl.h create mode 100644 include/xen/xen_proc.h create mode 100644 include/xen/xencons.h create mode 100644 include/xen/xenoprof.h create mode 100644 lib/swiotlb-xen.c create mode 100644 mm/frontswap.c create mode 100644 mm/tmem-xen.c create mode 100644 scripts/Makefile.xen.awk diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cleancache b/Documentation/ABI/testing/sysfs-kernel-mm-cleancache deleted file mode 100644 index 662ae64..0000000 --- a/Documentation/ABI/testing/sysfs-kernel-mm-cleancache +++ /dev/null @@ -1,11 +0,0 @@ -What: /sys/kernel/mm/cleancache/ -Date: April 2011 -Contact: Dan Magenheimer -Description: - /sys/kernel/mm/cleancache/ contains a number of files which - record a count of various cleancache operations - (sum across all filesystems): - succ_gets - failed_gets - puts - flushes diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index fc5b6ba..7d084eb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -874,6 +874,24 @@ bytes respectively. Such letter suffixes can also be entirely omitted. gpt [EFI] Forces disk with valid GPT signature but invalid Protective MBR to be treated as GPT. + guestdev= [PCI,ACPI,XEN] + Format: {|}][,{|}[,...]] + Format of device path: [:]-.[-.[,...]][+iomul] + Format of sbdf: [:]:.[+iomul] + Specifies PCI device for guest domain. + If PCI-PCI bridge is specified, all PCI devices + behind PCI-PCI bridge are reserved. + +iomul means that this PCI function will share + IO ports with other +iomul functions under same + switch. NOTE: if +iomul is specfied, all the functions + of the device will share IO ports. + + guestiomuldev= [PCI,ACPI,XEN] + Format: [sbd][,][,...] + Format of sbdf: [:]: + Note: function shouldn't be specified. + Specifies PCI device for IO port multiplexing driver. + hashdist= [KNL,NUMA] Large hashes allocated during boot are distributed across NUMA nodes. Defaults on for 64-bit NUMA, off otherwise. @@ -2114,6 +2132,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. realloc reallocate PCI resources if allocations done by BIOS are erroneous. 
+ pci_reserve= [PCI] + Format: [[+IO][+MEM]][,...] + Format of sbdf: [:]:. + Specifies the least reserved io size or memory size + which is assigned to PCI bridge even when no child + pci device exists. This is useful with PCI hotplug. + pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. off Disable ASPM. @@ -2289,6 +2314,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Run specified binary instead of /init from the ramdisk, used for early userspace startup. See initrd. + reassign_resources [PCI,ACPI,XEN] + Use guestdev= parameter to reassign device's + resources, or specify =all here. + reboot= [BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode Format: [,[,...]] See arch/*/kernel/reboot.c or arch/*/kernel/process.c diff --git a/Documentation/vm/cleancache.txt b/Documentation/vm/cleancache.txt index 36c367c..850580e 100644 --- a/Documentation/vm/cleancache.txt +++ b/Documentation/vm/cleancache.txt @@ -46,10 +46,11 @@ a negative return value indicates failure. A "put_page" will copy a the pool id, a file key, and a page index into the file. (The combination of a pool id, a file key, and an index is sometimes called a "handle".) A "get_page" will copy the page, if found, from cleancache into kernel memory. -A "flush_page" will ensure the page no longer is present in cleancache; -a "flush_inode" will flush all pages associated with the specified file; -and, when a filesystem is unmounted, a "flush_fs" will flush all pages in -all files specified by the given pool id and also surrender the pool id. +An "invalidate_page" will ensure the page no longer is present in cleancache; +an "invalidate_inode" will invalidate all pages associated with the specified +file; and, when a filesystem is unmounted, an "invalidate_fs" will invalidate +all pages in all files specified by the given pool id and also surrender +the pool id. An "init_shared_fs", like init_fs, obtains a pool id but tells cleancache to treat the pool as shared using a 128-bit UUID as a key. On systems @@ -62,12 +63,12 @@ of the kernel (e.g. by "tools" that control cleancache). Or a cleancache implementation can simply disable shared_init by always returning a negative value. -If a get_page is successful on a non-shared pool, the page is flushed (thus -making cleancache an "exclusive" cache). On a shared pool, the page -is NOT flushed on a successful get_page so that it remains accessible to +If a get_page is successful on a non-shared pool, the page is invalidated +(thus making cleancache an "exclusive" cache). On a shared pool, the page +is NOT invalidated on a successful get_page so that it remains accessible to other sharers. The kernel is responsible for ensuring coherency between cleancache (shared or not), the page cache, and the filesystem, using -cleancache flush operations as required. +cleancache invalidate operations as required. Note that cleancache must enforce put-put-get coherency and get-get coherency. For the former, if two puts are made to the same handle but @@ -77,20 +78,20 @@ if a get for a given handle fails, subsequent gets for that handle will never succeed unless preceded by a successful put with that handle. Last, cleancache provides no SMP serialization guarantees; if two -different Linux threads are simultaneously putting and flushing a page +different Linux threads are simultaneously putting and invalidating a page with the same handle, the results are indeterminate. Callers must lock the page to ensure serial behavior. 
CLEANCACHE PERFORMANCE METRICS -Cleancache monitoring is done by sysfs files in the -/sys/kernel/mm/cleancache directory. The effectiveness of cleancache +If properly configured, monitoring of cleancache is done via debugfs in +the /sys/kernel/debug/cleancache directory. The effectiveness of cleancache can be measured (across all filesystems) with: succ_gets - number of gets that were successful failed_gets - number of gets that failed puts - number of puts attempted (all "succeed") -flushes - number of flushes attempted +invalidates - number of invalidates attempted A backend implementatation may provide additional metrics. @@ -143,7 +144,7 @@ systems. The core hooks for cleancache in VFS are in most cases a single line and the minimum set are placed precisely where needed to maintain -coherency (via cleancache_flush operations) between cleancache, +coherency (via cleancache_invalidate operations) between cleancache, the page cache, and disk. All hooks compile into nothingness if cleancache is config'ed off and turn into a function-pointer- compare-to-NULL if config'ed on but no backend claims the ops @@ -184,15 +185,15 @@ or for real kernel-addressable RAM, it makes perfect sense for transcendent memory. 4) Why is non-shared cleancache "exclusive"? And where is the - page "flushed" after a "get"? (Minchan Kim) + page "invalidated" after a "get"? (Minchan Kim) The main reason is to free up space in transcendent memory and -to avoid unnecessary cleancache_flush calls. If you want inclusive, +to avoid unnecessary cleancache_invalidate calls. If you want inclusive, the page can be "put" immediately following the "get". If put-after-get for inclusive becomes common, the interface could -be easily extended to add a "get_no_flush" call. +be easily extended to add a "get_no_invalidate" call. -The flush is done by the cleancache backend implementation. +The invalidate is done by the cleancache backend implementation. 5) What's the performance impact? @@ -222,7 +223,7 @@ Some points for a filesystem to consider: as tmpfs should not enable cleancache) - To ensure coherency/correctness, the FS must ensure that all file removal or truncation operations either go through VFS or - add hooks to do the equivalent cleancache "flush" operations + add hooks to do the equivalent cleancache "invalidate" operations - To ensure coherency/correctness, either inode numbers must be unique across the lifetime of the on-disk file OR the FS must provide an "encode_fh" function. @@ -243,11 +244,11 @@ If cleancache would use the inode virtual address instead of inode/filehandle, the pool id could be eliminated. But, this won't work because cleancache retains pagecache data pages persistently even when the inode has been pruned from the -inode unused list, and only flushes the data page if the file +inode unused list, and only invalidates the data page if the file gets removed/truncated. So if cleancache used the inode kva, there would be potential coherency issues if/when the inode kva is reused for a different file. Alternately, if cleancache -flushed the pages when the inode kva was freed, much of the value +invalidated the pages when the inode kva was freed, much of the value of cleancache would be lost because the cache of pages in cleanache is potentially much larger than the kernel pagecache and is most useful if the pages survive inode cache removal. 
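The cleancache.txt changes above rename the backend hooks from flush_* to invalidate_* but leave the registration model unchanged: a backend fills in a struct cleancache_ops and hands it to cleancache_register_ops(). As a rough sketch of that contract, a deliberately trivial backend that caches nothing could look like the following; the member names mirror the operations named in the documentation above, while the exact prototypes (the size_t page size argument, struct cleancache_filekey, pgoff_t index) are assumptions to be checked against include/linux/cleancache.h in the base tree, not a definitive implementation.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/cleancache.h>

/* Illustrative no-op backend: accepts nothing, caches nothing. */

static int noop_init_fs(size_t pagesize)
{
        return -1;      /* refuse to hand out a pool id */
}

static int noop_init_shared_fs(char *uuid, size_t pagesize)
{
        return -1;      /* shared pools not supported */
}

static int noop_get_page(int pool_id, struct cleancache_filekey key,
                         pgoff_t index, struct page *page)
{
        return -1;      /* nothing is ever cached, so every get misses */
}

static void noop_put_page(int pool_id, struct cleancache_filekey key,
                          pgoff_t index, struct page *page)
{
        /* a put may always be dropped; cleancache is ephemeral */
}

static void noop_invalidate_page(int pool_id, struct cleancache_filekey key,
                                 pgoff_t index)
{
}

static void noop_invalidate_inode(int pool_id, struct cleancache_filekey key)
{
}

static void noop_invalidate_fs(int pool_id)
{
}

static struct cleancache_ops noop_cleancache_ops = {
        .init_fs          = noop_init_fs,
        .init_shared_fs   = noop_init_shared_fs,
        .get_page         = noop_get_page,
        .put_page         = noop_put_page,
        .invalidate_page  = noop_invalidate_page,
        .invalidate_inode = noop_invalidate_inode,
        .invalidate_fs    = noop_invalidate_fs,
};

static int __init noop_cleancache_init(void)
{
        /* return value (the previously registered ops) ignored here */
        cleancache_register_ops(&noop_cleancache_ops);
        return 0;
}
module_init(noop_cleancache_init);
MODULE_LICENSE("GPL");

Since every get misses and every put is silently dropped, registering such a backend behaves the same as running with cleancache disabled, which is what makes it a reasonable skeleton for a real transcendent-memory backend such as Xen tmem or zcache.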
diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.txt new file mode 100644 index 0000000..5a1a00c --- /dev/null +++ b/Documentation/vm/frontswap.txt @@ -0,0 +1,210 @@ +Frontswap provides a "transcendent memory" interface for swap pages. +In some environments, dramatic performance savings may be obtained because +swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. + +Frontswap is so named because it can be thought of as the opposite of +a "backing" store for a swap device. The storage is assumed to be +a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming +to the requirements of transcendent memory (such as Xen's "tmem", or +in-kernel compressed memory, aka "zcache", or future RAM-like devices); +this pseudo-RAM device is not directly accessible or addressable by the +kernel and is of unknown and possibly time-varying size. The driver +links itself to frontswap by calling frontswap_register_ops to set the +frontswap_ops funcs appropriately and the functions it provides must +conform to certain policies as follows: + +An "init" prepares the device to receive frontswap pages associated +with the specified swap device number (aka "type"). A "put_page" will +copy the page to transcendent memory and associate it with the type and +offset associated with the page. A "get_page" will copy the page, if found, +from transcendent memory into kernel memory, but will NOT remove the page +from from transcendent memory. An "invalidate_page" will remove the page +from transcendent memory and an "invalidate_area" will remove ALL pages +associated with the swap type (e.g., like swapoff) and notify the "device" +to refuse further puts with that swap type. + +Once a page is successfully put, a matching get on the page will normally +succeed. So when the kernel finds itself in a situation where it needs +to swap out a page, it first attempts to use frontswap. If the put returns +success, the data has been successfully saved to transcendent memory and +a disk write and, if the data is later read back, a disk read are avoided. +If a put returns failure, transcendent memory has rejected the data, and the +page can be written to swap as usual. + +Note that if a page is put and the page already exists in transcendent memory +(a "duplicate" put), either the put succeeds and the data is overwritten, +or the put fails AND the page is invalidated. This ensures stale data may +never be obtained from frontswap. + +If properly configured, monitoring of frontswap is done via debugfs in +the /sys/kernel/debug/frontswap directory. The effectiveness of +frontswap can be measured (across all swap devices) with: + +failed_puts - how many put attempts have failed +gets - how many gets were attempted (all should succeed) +succ_puts - how many put attempts have succeeded +invalidates - how many invalidates were attempted + +A backend implementation may provide additional metrics. + +FAQ + +1) Where's the value? + +When a workload starts swapping, performance falls through the floor. +Frontswap significantly increases performance in many such workloads by +providing a clean, dynamic interface to read and write swap pages to +"transcendent memory" that is otherwise not directly addressable to the kernel. +This interface is ideal when data is transformed to a different form +and size (such as with compression) or secretly moved (as might be +useful for write-balancing for some RAM-like devices). 
Swap pages (and +evicted page-cache pages) are a great use for this kind of slower-than-RAM- +but-much-faster-than-disk "pseudo-RAM device" and the frontswap (and +cleancache) interface to transcendent memory provides a nice way to read +and write -- and indirectly "name" -- the pages. + +In the virtual case, the whole point of virtualization is to statistically +multiplex physical resources acrosst the varying demands of multiple +virtual machines. This is really hard to do with RAM and efforts to do +it well with no kernel changes have essentially failed (except in some +well-publicized special-case workloads). Frontswap -- and cleancache -- +with a fairly small impact on the kernel, provides a huge amount +of flexibility for more dynamic, flexible RAM multiplexing. +Specifically, the Xen Transcendent Memory backend allows otherwise +"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple +virtual machines, but the pages can be compressed and deduplicated to +optimize RAM utilization. And when guest OS's are induced to surrender +underutilized RAM (e.g. with "self-ballooning"), sudden unexpected +memory pressure may result in swapping; frontswap allows those pages +to be swapped to and from hypervisor RAM if overall host system memory +conditions allow. + +2) Sure there may be performance advantages in some situations, but + what's the space/time overhead of frontswap? + +If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into +nothingness and the only overhead is a few extra bytes per swapon'ed +swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" +registers, there is one extra global variable compared to zero for +every swap page read or written. If CONFIG_FRONTSWAP is enabled +AND a frontswap backend registers AND the backend fails every "put" +request (i.e. provides no memory despite claiming it might), +CPU overhead is still negligible -- and since every frontswap fail +precedes a swap page write-to-disk, the system is highly likely +to be I/O bound and using a small fraction of a percent of a CPU +will be irrelevant anyway. + +As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend +registers, one bit is allocated for every swap page for every swap +device that is swapon'd. This is added to the EIGHT bits (which +was sixteen until about 2.6.34) that the kernel already allocates +for every swap page for every swap device that is swapon'd. (Hugh +Dickins has observed that frontswap could probably steal one of +the existing eight bits, but let's worry about that minor optimization +later.) For very large swap disks (which are rare) on a standard +4K pagesize, this is 1MB per 32GB swap. + +3) OK, how about a quick overview of what this frontswap patch does + in terms that a kernel hacker can grok? + +Let's assume that a frontswap "backend" has registered during +kernel initialization; this registration indicates that this +frontswap backend has access to some "memory" that is not directly +accessible by the kernel. Exactly how much memory it provides is +entirely dynamic and random. + +Whenever a swap-device is swapon'd frontswap_init() is called, +passing the swap device number (aka "type") as a parameter. +This notifies frontswap to expect attempts to "put" swap pages +associated with that number. + +Whenever the swap subsystem is readying a page to write to a swap +device (c.f swap_writepage()), frontswap_put_page is called. 
Frontswap +consults with the frontswap backend and if the backend says it does NOT +have room, frontswap_put_page returns -1 and the kernel swaps the page +to the swap device as normal. Note that the response from the frontswap +backend is unpredictable to the kernel; it may choose to never accept a +page, it could accept every ninth page, or it might accept every +page. But if the backend does accept a page, the data from the page +has already been copied and associated with the type and offset, +and the backend guarantees the persistence of the data. In this case, +frontswap sets a bit in the "frontswap_map" for the swap device +corresponding to the page offset on the swap device to which it would +otherwise have written the data. + +When the swap subsystem needs to swap-in a page (swap_readpage()), +it first calls frontswap_get_page() which checks the frontswap_map to +see if the page was earlier accepted by the frontswap backend. If +it was, the page of data is filled from the frontswap backend and +the swap-in is complete. If not, the normal swap-in code is +executed to obtain the page of data from the real swap device. + +So every time the frontswap backend accepts a page, a swap device read +and (potentially) a swap device write are replaced by a "frontswap backend +put" and (possibly) a "frontswap backend get", which are presumably much +faster. + +4) Can't frontswap be configured as a "special" swap device that is + just higher priority than any real swap device (e.g. like zswap)? + +No. Recall that acceptance of any swap page by the frontswap +backend is entirely unpredictable. This is critical to the definition +of frontswap because it grants completely dynamic discretion to the +backend. But since any "put" might fail, there must always be a real +slot on a real swap device to swap the page. Thus frontswap must be +implemented as a "shadow" to every swapon'd device with the potential +capability of holding every page that the swap device might have held +and the possibility that it might hold no pages at all. +On the downside, this also means that frontswap cannot contain more +pages than the total of swapon'd swap devices. For example, if NO +swap device is configured on some installation, frontswap is useless. + +Further, frontswap is entirely synchronous whereas a real swap +device is, by definition, asynchronous and uses block I/O. The +block I/O layer is not only unnecessary, but may perform "optimizations" +that are inappropriate for a RAM-oriented device including delaying +the write of some pages for a significant amount of time. Synchrony is +required to ensure the dynamicity of the backend and to avoid thorny race +conditions that would unnecessarily and greatly complicate frontswap +and/or the block I/O subsystem. + +In a virtualized environment, the dynamicity allows the hypervisor +(or host OS) to do "intelligent overcommit". For example, it can +choose to accept pages only until host-swapping might be imminent, +then force guests to do their own swapping. In zcache, "poorly" +compressible pages can be rejected, where "poorly" can itself be defined +dynamically depending on current memory constraints. + +5) Why this weird definition about "duplicate puts"? If a page + has been previously successfully put, can't it always be + successfully overwritten? + +Nearly always it can, but no, sometimes it cannot. Consider an example +where data is compressed and the original 4K page has been compressed +to 1K. 
Now an attempt is made to overwrite the page with data that +is non-compressible and so would take the entire 4K. But the backend +has no more space. In this case, the put must be rejected. Whenever +frontswap rejects a put that would overwrite, it also must invalidate +the old data and ensure that it is no longer accessible. Since the +swap subsystem then writes the new data to the real swap device, +this is the correct course of action to ensure coherency. + +6) What is frontswap_shrink for? + +When the (non-frontswap) swap subsystem swaps out a page to a real +swap device, that page is only taking up low-value pre-allocated disk +space. But if frontswap has placed a page in transcendent memory, that +page may be taking up valuable real estate. The frontswap_shrink +routine allows code outside of the swap subsystem (such as Xen tmem +or zcache or some future tmem backend) to force pages out of the memory +managed by frontswap and back into kernel-addressable memory. + +7) Why does the frontswap patch create the new include file swapfile.h? + +The frontswap code depends on some swap-subsystem-internal data +structures that have, over the years, moved back and forth between +static and global. This seemed a reasonable compromise: Define +them as global but declare them in a new include file that isn't +included by the large number of source files that include swap.h. + +Dan Magenheimer, last updated September 12, 2011 diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 9b32d0e1..12a0e6f 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -234,7 +234,7 @@ config IA64_HP_SIM config IA64_XEN_GUEST bool "Xen guest" select SWIOTLB - depends on XEN + depends on PARAVIRT_XEN help Build a kernel that runs on Xen guest domain. At this moment only 16KB page size in supported.
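As an illustration of the hook set and policies that the frontswap.txt text added above describes, here is a minimal, self-contained userspace sketch in plain C. It is only a model: the real frontswap_ops structure, its exact signatures and frontswap_register_ops() are defined by the kernel code in this patch and are mirrored here by name only, and the acceptance policy shown is invented purely for the example.

/*
 * Toy model of the documented frontswap policies: a put may be refused at
 * the backend's discretion, a successful put is later readable via get
 * without being removed, and a refused overwrite must invalidate any older
 * copy so stale data can never be returned.  Names and the "refuse every
 * ninth offset" rule are assumptions made up for this sketch.
 */
#include <stdio.h>
#include <string.h>

#define TOY_PAGE_SIZE 4096
#define TOY_MAX_PAGES 8			/* tiny "pseudo-RAM device" */

struct toy_page {
	int used;
	unsigned int type;		/* swap device number ("type") */
	unsigned long offset;		/* page offset within that device */
	unsigned char data[TOY_PAGE_SIZE];
};

static struct toy_page store[TOY_MAX_PAGES];

static struct toy_page *toy_find(unsigned int type, unsigned long offset)
{
	for (int i = 0; i < TOY_MAX_PAGES; i++)
		if (store[i].used && store[i].type == type &&
		    store[i].offset == offset)
			return &store[i];
	return NULL;
}

/* "invalidate_page": drop any stored copy so stale data is never returned. */
static void toy_invalidate_page(unsigned int type, unsigned long offset)
{
	struct toy_page *p = toy_find(type, offset);

	if (p)
		p->used = 0;
}

/* "put_page": 0 on success, -1 if the backend refuses to take the page. */
static int toy_put_page(unsigned int type, unsigned long offset,
			const void *page)
{
	struct toy_page *p = toy_find(type, offset);

	/*
	 * Model the backend's unpredictable acceptance: arbitrarily refuse
	 * every ninth offset.  A refused put that would have overwritten an
	 * earlier copy must also invalidate that copy (the "duplicate put"
	 * rule in the text above).
	 */
	if (offset % 9 == 8) {
		toy_invalidate_page(type, offset);
		return -1;
	}

	if (!p)				/* new page: look for a free slot */
		for (int i = 0; i < TOY_MAX_PAGES && !p; i++)
			if (!store[i].used)
				p = &store[i];
	if (!p)
		return -1;		/* no room: the kernel would swap to disk */

	p->used = 1;
	p->type = type;
	p->offset = offset;
	memcpy(p->data, page, TOY_PAGE_SIZE);
	return 0;
}

/* "get_page": copy the data out but do NOT remove it from the store. */
static int toy_get_page(unsigned int type, unsigned long offset, void *page)
{
	struct toy_page *p = toy_find(type, offset);

	if (!p)
		return -1;
	memcpy(page, p->data, TOY_PAGE_SIZE);
	return 0;
}

int main(void)
{
	static unsigned char in[TOY_PAGE_SIZE], out[TOY_PAGE_SIZE];

	memset(in, 0xab, sizeof(in));
	if (toy_put_page(0, 42, in) == 0 && toy_get_page(0, 42, out) == 0)
		printf("round trip ok: %d\n", !memcmp(in, out, sizeof(in)));
	return 0;
}

Built with any C99 compiler (e.g. gcc -std=c99), the sketch prints "round trip ok: 1", matching the put/get behaviour described in item 3 of the FAQ above.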
diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index be7bfa1..342907d 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -55,7 +55,7 @@ core-$(CONFIG_IA64_XEN_GUEST) += arch/ia64/dig/ core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/ core-$(CONFIG_IA64_SGI_UV) += arch/ia64/uv/ core-$(CONFIG_KVM) += arch/ia64/kvm/ -core-$(CONFIG_XEN) += arch/ia64/xen/ +core-$(CONFIG_PARAVIRT_XEN) += arch/ia64/xen/ drivers-$(CONFIG_PCI) += arch/ia64/pci/ drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/ diff --git a/arch/ia64/include/asm/xen/hypervisor.h b/arch/ia64/include/asm/xen/hypervisor.h index 67455c2..aacad12 100644 --- a/arch/ia64/include/asm/xen/hypervisor.h +++ b/arch/ia64/include/asm/xen/hypervisor.h @@ -34,13 +34,13 @@ #define _ASM_IA64_XEN_HYPERVISOR_H #include +#include +#ifdef CONFIG_PARAVIRT_XEN #include #include /* to compile feature.c */ #include /* to comiple xen-netfront.c */ -#include #include -#ifdef CONFIG_XEN extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; diff --git a/arch/ia64/include/asm/xen/interface.h b/arch/ia64/include/asm/xen/interface.h index fbb5198..d950667 100644 --- a/arch/ia64/include/asm/xen/interface.h +++ b/arch/ia64/include/asm/xen/interface.h @@ -56,30 +56,19 @@ #ifndef _ASM_IA64_XEN_INTERFACE_H #define _ASM_IA64_XEN_INTERFACE_H -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name #define DEFINE_GUEST_HANDLE_STRUCT(name) \ - __DEFINE_GUEST_HANDLE(name, struct name) -#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) -#define GUEST_HANDLE(name) __guest_handle_ ## name -#define GUEST_HANDLE_64(name) GUEST_HANDLE(name) + __DEFINE_XEN_GUEST_HANDLE(name, struct name) +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) +#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name +#define XEN_GUEST_HANDLE_64(name) XEN_GUEST_HANDLE(name) #define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0) #ifndef __ASSEMBLY__ -/* Guest handles for primitive C types. */ -__DEFINE_GUEST_HANDLE(uchar, unsigned char); -__DEFINE_GUEST_HANDLE(uint, unsigned int); -__DEFINE_GUEST_HANDLE(ulong, unsigned long); - -DEFINE_GUEST_HANDLE(char); -DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(long); -DEFINE_GUEST_HANDLE(void); -DEFINE_GUEST_HANDLE(uint64_t); - +typedef unsigned long xen_ulong_t; typedef unsigned long xen_pfn_t; -DEFINE_GUEST_HANDLE(xen_pfn_t); #define PRI_xen_pfn "lx" #endif @@ -91,7 +80,7 @@ DEFINE_GUEST_HANDLE(xen_pfn_t); /* Maximum number of virtual CPUs in multi-processor guests. */ /* keep sizeof(struct shared_page) <= PAGE_SIZE. * this is checked in arch/ia64/xen/hypervisor.c. */ -#define MAX_VIRT_CPUS 64 +#define XEN_LEGACY_MAX_VCPUS 64 #ifndef __ASSEMBLY__ diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c index af56501..166ced4 100644 --- a/arch/ia64/kernel/asm-offsets.c +++ b/arch/ia64/kernel/asm-offsets.c @@ -290,7 +290,7 @@ void foo(void) DEFINE(IA64_ITC_LASTCYCLE_OFFSET, offsetof (struct itc_jitter_data_t, itc_lastcycle)); -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN BLANK(); DEFINE(XEN_NATIVE_ASM, XEN_NATIVE); diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 53c0ba0..ddde313 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -183,7 +183,7 @@ SECTIONS { __start_gate_section = .; *(.data..gate) __stop_gate_section = .; -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN . 
= ALIGN(PAGE_SIZE); __xen_start_gate_section = .; *(.data..gate.xen) diff --git a/arch/ia64/xen/Kconfig b/arch/ia64/xen/Kconfig index 515e082..14d8ac6 100644 --- a/arch/ia64/xen/Kconfig +++ b/arch/ia64/xen/Kconfig @@ -2,7 +2,7 @@ # This Kconfig describes xen/ia64 options # -config XEN +config PARAVIRT_XEN bool "Xen hypervisor support" default y depends on PARAVIRT && MCKINLEY && IA64_PAGE_SIZE_16KB && EXPERIMENTAL @@ -16,10 +16,6 @@ config XEN Enable Xen hypervisor support. Resulting kernel runs both as a guest OS on Xen and natively on hardware. -config XEN_XENCOMM - depends on XEN - bool - config NO_IDLE_HZ - depends on XEN + depends on PARAVIRT_XEN bool diff --git a/arch/ia64/xen/xcom_hcall.c b/arch/ia64/xen/xcom_hcall.c index ccaf743..7690fc3 100644 --- a/arch/ia64/xen/xcom_hcall.c +++ b/arch/ia64/xen/xcom_hcall.c @@ -343,7 +343,7 @@ xencommize_memory_reservation(struct xencomm_mini *xc_area, int xencomm_hypercall_memory_op(unsigned int cmd, void *arg) { - GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} }; + XEN_GUEST_HANDLE(xen_pfn_t) extent_start_va[2] = { {NULL}, {NULL} }; struct xen_memory_reservation *xmr = NULL; int rc; struct xencomm_handle *desc; diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 0e9dec6..006ed9e 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -2,7 +2,7 @@ obj-$(CONFIG_KVM) += kvm/ # Xen paravirtualization support -obj-$(CONFIG_XEN) += xen/ +obj-$(CONFIG_PARAVIRT_XEN) += xen/ # lguest paravirtualization support obj-$(CONFIG_LGUEST_GUEST) += lguest/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ffcb80b..6e8abc5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -8,7 +8,7 @@ config 64BIT config X86_32 def_bool !64BIT - select CLKSRC_I8253 + select CLKSRC_I8253 if !XEN config X86_64 def_bool 64BIT @@ -20,7 +20,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE - select HAVE_PCSPKR_PLATFORM + select HAVE_PCSPKR_PLATFORM if !XEN_UNPRIVILEGED_GUEST select HAVE_PERF_EVENTS select HAVE_IRQ_WORK select HAVE_IOREMAP_PROT @@ -42,8 +42,8 @@ config X86 select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE select HAVE_SYSCALL_TRACEPOINTS - select HAVE_KVM - select HAVE_ARCH_KGDB + select HAVE_KVM if !XEN + select HAVE_ARCH_KGDB if !XEN select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS @@ -51,14 +51,14 @@ config X86 select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_DMA_API_DEBUG select HAVE_KERNEL_GZIP - select HAVE_KERNEL_BZIP2 - select HAVE_KERNEL_LZMA - select HAVE_KERNEL_XZ - select HAVE_KERNEL_LZO + select HAVE_KERNEL_BZIP2 if !XEN + select HAVE_KERNEL_LZMA if !XEN + select HAVE_KERNEL_XZ if !XEN + select HAVE_KERNEL_LZO if !XEN select HAVE_HW_BREAKPOINT select HAVE_MIXED_BREAKPOINTS_REGS select PERF_EVENTS - select HAVE_PERF_EVENTS_NMI + select HAVE_PERF_EVENTS_NMI if !XEN select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 @@ -79,7 +79,7 @@ config X86 select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if (X86_64 && NET) - select CLKEVT_I8253 + select CLKEVT_I8253 if !XEN select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP @@ -101,17 +101,19 @@ config GENERIC_CMOS_UPDATE config CLOCKSOURCE_WATCHDOG def_bool y + depends on !XEN config GENERIC_CLOCKEVENTS def_bool y config ARCH_CLOCKSOURCE_DATA def_bool y - depends on X86_64 + depends on X86_64 && !XEN config GENERIC_CLOCKEVENTS_BROADCAST def_bool y depends on X86_64 || 
(X86_32 && X86_LOCAL_APIC) + depends on !XEN config LOCKDEP_SUPPORT def_bool y @@ -129,7 +131,7 @@ config SBUS bool config NEED_DMA_MAP_STATE - def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG) + def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB) config NEED_SG_DMA_LENGTH def_bool y @@ -193,6 +195,7 @@ config NEED_PER_CPU_PAGE_FIRST_CHUNK config ARCH_HIBERNATION_POSSIBLE def_bool y + depends on !XEN config ARCH_SUSPEND_POSSIBLE def_bool y @@ -225,7 +228,15 @@ config X86_64_SMP config X86_HT def_bool y - depends on SMP + depends on SMP && !XEN + +config X86_NO_TSS + def_bool y + depends on XEN + +config X86_NO_IDT + def_bool y + depends on XEN config X86_32_LAZY_GS def_bool y @@ -241,7 +252,7 @@ config KTIME_SCALAR config ARCH_CPU_PROBE_RELEASE def_bool y - depends on HOTPLUG_CPU + depends on HOTPLUG_CPU && !XEN source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -307,13 +318,22 @@ config X86_MPPARSE For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it +config X86_XEN + bool "Xen-compatible" + depends on X86_32 + select XEN + select X86_PAE + help + Choose this option if you plan to run this kernel on top of the + Xen Hypervisor. + config X86_BIGSMP bool "Support for big SMP systems with more than 8 CPUs" - depends on X86_32 && SMP + depends on X86_32 && SMP && !XEN ---help--- This option is needed for the systems that have more than 8 CPUs -if X86_32 +if X86_32 && !XEN config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -336,7 +356,14 @@ config X86_EXTENDED_PLATFORM generic distribution kernel, say Y here - otherwise say N. endif -if X86_64 +config X86_64_XEN + bool "Enable Xen compatible kernel" + depends on X86_64 + select XEN + help + This option will compile a kernel compatible with Xen hypervisor + +if X86_64 && !XEN config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -538,7 +565,7 @@ config X86_ES7000 config X86_32_IRIS tristate "Eurobraille/Iris poweroff module" - depends on X86_32 + depends on X86_32 && !XEN ---help--- The Iris machines from EuroBraille do not have APM or ACPI support to shut themselves down properly. A special I/O sequence is @@ -563,6 +590,7 @@ config SCHED_OMIT_FRAME_POINTER menuconfig PARAVIRT_GUEST bool "Paravirtualized guest support" + depends on !XEN ---help--- Say Y here to get to see options related to running Linux under various hypervisors. This option alone does not add any kernel code. @@ -643,6 +671,7 @@ config NO_BOOTMEM config MEMTEST bool "Memtest" + depends on !XEN ---help--- This option adds a kernel parameter 'memtest', which allows memtest to be set. @@ -665,6 +694,7 @@ source "arch/x86/Kconfig.cpu" config HPET_TIMER def_bool X86_64 prompt "HPET Timer Support" if X86_32 + depends on !XEN ---help--- Use the IA-PC HPET (High Precision Event Timer) to manage time in preference to the PIT and RTC, if a HPET is @@ -702,6 +732,7 @@ config APB_TIMER config DMI default y bool "Enable DMI scanning" if EXPERT + depends on !XEN_UNPRIVILEGED_GUEST ---help--- Enabled scanning of DMI to identify machine quirks. Say Y here unless you have verified that your setup is not @@ -712,7 +743,7 @@ config GART_IOMMU bool "GART IOMMU support" if EXPERT default y select SWIOTLB - depends on X86_64 && PCI && AMD_NB + depends on X86_64 && PCI && AMD_NB && !X86_64_XEN ---help--- Support for full DMA access of devices with 32bit memory access only on systems with more than 3GB. 
This is usually needed for USB, @@ -727,7 +758,7 @@ config GART_IOMMU config CALGARY_IOMMU bool "IBM Calgary IOMMU support" select SWIOTLB - depends on X86_64 && PCI && EXPERIMENTAL + depends on X86_64 && PCI && !X86_64_XEN && EXPERIMENTAL ---help--- Support for hardware IOMMUs in IBM's xSeries x366 and x460 systems. Needed to run systems with more than 3GB of memory @@ -755,7 +786,8 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT # need this always selected by IOMMU for the VIA workaround config SWIOTLB - def_bool y if X86_64 + def_bool y if X86_64 || XEN + prompt "Software I/O TLB" if XEN_UNPRIVILEGED_GUEST && !XEN_PCIDEV_FRONTEND ---help--- Support for software bounce buffers used on x86-64 systems which don't have a hardware IOMMU (e.g. the current generation @@ -776,11 +808,12 @@ config MAXSMP config NR_CPUS int "Maximum number of CPUs" if SMP && !MAXSMP - range 2 8 if SMP && X86_32 && !X86_BIGSMP + range 2 8 if SMP && X86_32 && !X86_BIGSMP && !X86_XEN range 2 512 if SMP && !MAXSMP default "1" if !SMP default "4096" if MAXSMP default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000) + default "16" if X86_64_XEN default "8" if SMP ---help--- This allows you to specify the maximum number of CPUs which this @@ -823,7 +856,7 @@ source "kernel/Kconfig.preempt" config X86_UP_APIC bool "Local APIC support on uniprocessors" - depends on X86_32 && !SMP && !X86_32_NON_STANDARD + depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !XEN_UNPRIVILEGED_GUEST ---help--- A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -849,10 +882,12 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC + depends on !XEN_UNPRIVILEGED_GUEST config X86_IO_APIC def_bool y depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC + depends on !XEN_UNPRIVILEGED_GUEST config X86_VISWS_APIC def_bool y @@ -860,7 +895,7 @@ config X86_VISWS_APIC config X86_REROUTE_FOR_BROKEN_BOOT_IRQS bool "Reroute for broken boot IRQs" - depends on X86_IO_APIC + depends on X86_IO_APIC && !XEN ---help--- This option enables a workaround that fixes a source of spurious interrupts. This is recommended when threaded @@ -883,6 +918,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS config X86_MCE bool "Machine Check / overheating reporting" + depends on !XEN_UNPRIVILEGED_GUEST ---help--- Machine Check support allows the processor to notify the kernel if it detects a problem (e.g. overheating, data corruption). @@ -892,7 +928,7 @@ config X86_MCE config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && !XEN ---help--- Additional support for intel specific MCE features such as the thermal monitor. @@ -900,14 +936,14 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && !XEN ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. config X86_ANCIENT_MCE bool "Support for old Pentium 5 / WinChip machine checks" - depends on X86_32 && X86_MCE + depends on X86_32 && X86_MCE && !XEN ---help--- Include support for machine check handling on old Pentium 5 or WinChip systems. These typically need to be enabled explicitely on the command @@ -925,6 +961,10 @@ config X86_MCE_INJECT If you don't know what a machine check is and you don't do kernel QA it is safe to say n. 
+config X86_XEN_MCE + def_bool y + depends on XEN && X86_MCE + config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL @@ -978,7 +1018,7 @@ config I8K config X86_REBOOTFIXUPS bool "Enable X86 board specific fixups for reboot" - depends on X86_32 + depends on X86_32 && !XEN ---help--- This enables chipset and/or board specific fixups to be done in order to get reboot to work correctly. This is only needed on @@ -995,6 +1035,7 @@ config X86_REBOOTFIXUPS config MICROCODE tristate "/dev/cpu/microcode - microcode support" + depends on !XEN_UNPRIVILEGED_GUEST select FW_LOADER ---help--- If you say Y here, you will be able to update the microcode on @@ -1013,7 +1054,7 @@ config MICROCODE config MICROCODE_INTEL bool "Intel microcode patch loading support" - depends on MICROCODE + depends on MICROCODE && !XEN default MICROCODE select FW_LOADER ---help--- @@ -1026,7 +1067,7 @@ config MICROCODE_INTEL config MICROCODE_AMD bool "AMD microcode patch loading support" - depends on MICROCODE + depends on MICROCODE && !XEN select FW_LOADER ---help--- If you select this option, microcode patch loading support for AMD @@ -1038,6 +1079,7 @@ config MICROCODE_OLD_INTERFACE config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" + select XEN_DOMCTL if XEN_PRIVILEGED_GUEST ---help--- This device gives privileged processes access to the x86 Model-Specific Registers (MSRs). It is a character device with @@ -1055,7 +1097,7 @@ config X86_CPUID choice prompt "High Memory Support" - default HIGHMEM64G if X86_NUMAQ + default HIGHMEM64G if X86_NUMAQ || XEN default HIGHMEM4G depends on X86_32 @@ -1098,7 +1140,7 @@ config NOHIGHMEM config HIGHMEM4G bool "4GB" - depends on !X86_NUMAQ + depends on !X86_NUMAQ && !XEN ---help--- Select this if you have a 32-bit processor and between 1 and 4 gigabytes of physical RAM. @@ -1174,12 +1216,12 @@ config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE config ARCH_DMA_ADDR_T_64BIT - def_bool X86_64 || HIGHMEM64G + def_bool X86_64 || XEN || HIGHMEM64G config DIRECT_GBPAGES bool "Enable 1GB pages for kernel pagetables" if EXPERT default y - depends on X86_64 + depends on X86_64 && !XEN ---help--- Allow the kernel linear mapping to use 1GB pages on CPUs that support it. This can improve the kernel's performance a tiny bit by @@ -1188,7 +1230,7 @@ config DIRECT_GBPAGES # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" - depends on SMP + depends on SMP && !XEN depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) ---help--- @@ -1289,12 +1331,13 @@ config ARCH_DISCONTIGMEM_DEFAULT config ARCH_SPARSEMEM_ENABLE def_bool y depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD + depends on !XEN select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 config ARCH_SPARSEMEM_DEFAULT def_bool y - depends on X86_64 + depends on X86_64 && !X86_64_XEN config ARCH_SELECT_MEMORY_MODEL def_bool y @@ -1326,6 +1369,7 @@ config HIGHPTE config X86_CHECK_BIOS_CORRUPTION bool "Check for low memory corruption" + depends on !XEN ---help--- Periodically check for memory corruption in low memory, which is suspected to be caused by BIOS. 
Even when enabled in the @@ -1356,6 +1400,7 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK config X86_RESERVE_LOW int "Amount of low memory, in kilobytes, to reserve for the BIOS" + depends on !XEN default 64 range 4 640 ---help--- @@ -1386,6 +1431,7 @@ config X86_RESERVE_LOW config MATH_EMULATION bool prompt "Math emulation" if X86_32 + depends on !XEN ---help--- Linux can emulate a math coprocessor (used for floating point operations) if you don't have one. 486DX and Pentium processors have @@ -1412,6 +1458,7 @@ config MATH_EMULATION config MTRR def_bool y prompt "MTRR (Memory Type Range Register) support" if EXPERT + depends on !XEN_UNPRIVILEGED_GUEST ---help--- On Intel P6 family processors (Pentium Pro, Pentium II and later) the Memory Type Range Registers (MTRRs) may be used to control @@ -1447,7 +1494,7 @@ config MTRR config MTRR_SANITIZER def_bool y prompt "MTRR cleanup support" - depends on MTRR + depends on MTRR && !XEN ---help--- Convert MTRR layout from continuous to discrete, so X drivers can add writeback entries. @@ -1477,8 +1524,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT config X86_PAT def_bool y - prompt "x86 PAT support" if EXPERT - depends on MTRR + prompt "x86 PAT support" if EXPERT || XEN_UNPRIVILEGED_GUEST + depends on MTRR || (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND) ---help--- Use PAT attributes to setup page level cache control. @@ -1505,7 +1552,7 @@ config ARCH_RANDOM config EFI bool "EFI runtime service support" - depends on ACPI + depends on ACPI && !XEN_UNPRIVILEGED_GUEST ---help--- This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). @@ -1519,7 +1566,7 @@ config EFI config EFI_STUB bool "EFI stub support" - depends on EFI + depends on EFI && !XEN ---help--- This kernel feature allows a bzImage to be loaded directly by EFI firmware without the use of a bootloader. @@ -1560,6 +1607,7 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" + depends on !XEN_UNPRIVILEGED_GUEST ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot @@ -1577,6 +1625,7 @@ config KEXEC config CRASH_DUMP bool "kernel crash dumps" depends on X86_64 || (X86_32 && HIGHMEM) + depends on !XEN ---help--- Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels @@ -1597,7 +1646,8 @@ config KEXEC_JUMP code in physical address mode via KEXEC config PHYSICAL_START - hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP) + hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP || XEN) + default 0x100000 if XEN default "0x1000000" ---help--- This gives the physical address where the kernel is loaded. 
@@ -1639,6 +1689,7 @@ config PHYSICAL_START config RELOCATABLE bool "Build a relocatable kernel" + depends on !XEN default y ---help--- This builds a kernel image that retains relocation information @@ -1660,7 +1711,8 @@ config X86_NEED_RELOCS depends on X86_32 && RELOCATABLE config PHYSICAL_ALIGN - hex "Alignment value to which kernel should be aligned" if X86_32 + hex "Alignment value to which kernel should be aligned" if X86_32 && !XEN + default 0x2000 if XEN default "0x1000000" range 0x2000 0x1000000 ---help--- @@ -1753,6 +1805,7 @@ endmenu config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) + depends on !XEN config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y @@ -1770,6 +1823,8 @@ config ARCH_HIBERNATION_HEADER source "kernel/power/Kconfig" +if !XEN_UNPRIVILEGED_GUEST + source "drivers/acpi/Kconfig" source "drivers/sfi/Kconfig" @@ -1780,7 +1835,7 @@ config X86_APM_BOOT menuconfig APM tristate "APM (Advanced Power Management) BIOS support" - depends on X86_32 && PM_SLEEP + depends on X86_32 && PM_SLEEP && !XEN ---help--- APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with @@ -1905,6 +1960,8 @@ source "drivers/cpuidle/Kconfig" source "drivers/idle/Kconfig" +endif # !XEN_UNPRIVILEGED_GUEST + endmenu @@ -1914,6 +1971,7 @@ config PCI bool "PCI support" default y select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) + select ARCH_SUPPORTS_MSI if (XEN_UNPRIVILEGED_GUEST && XEN_PCIDEV_FRONTEND) ---help--- Find out whether you have a PCI motherboard. PCI is the name of a bus system, i.e. the way the CPU talks to the other stuff inside @@ -1941,25 +1999,36 @@ choice config PCI_GOBIOS bool "BIOS" + depends on !XEN config PCI_GOMMCONFIG bool "MMConfig" + depends on !XEN_UNPRIVILEGED_GUEST config PCI_GODIRECT bool "Direct" + depends on !XEN_UNPRIVILEGED_GUEST config PCI_GOOLPC bool "OLPC XO-1" - depends on OLPC + depends on OLPC && !XEN_UNPRIVILEGED_GUEST + +config PCI_GOXEN_FE + bool "Xen PCI Frontend" + depends on X86_XEN + help + The PCI device frontend driver allows the kernel to import arbitrary + PCI devices from a PCI backend to support PCI driver domains. config PCI_GOANY bool "Any" + depends on !XEN_UNPRIVILEGED_GUEST endchoice config PCI_BIOS def_bool y - depends on X86_32 && PCI && (PCI_GOBIOS || PCI_GOANY) + depends on X86_32 && PCI && !XEN && (PCI_GOBIOS || PCI_GOANY) # x86-64 doesn't support PCI BIOS access from long mode so always go direct. config PCI_DIRECT @@ -1976,7 +2045,7 @@ config PCI_OLPC config PCI_XEN def_bool y - depends on PCI && XEN + depends on PCI && PARAVIRT_XEN select SWIOTLB_XEN config PCI_DOMAINS @@ -2007,7 +2076,7 @@ source "drivers/pci/Kconfig" # x86_64 have no ISA slots, but can have ISA-style DMA. config ISA_DMA_API - bool "ISA-style DMA support" if (X86_64 && EXPERT) + bool "ISA-style DMA support" if ((X86_64 || XEN) && EXPERT) || XEN_UNPRIVILEGED_GUEST default y help Enables ISA-style DMA support for devices requiring such controllers. @@ -2017,6 +2086,7 @@ if X86_32 config ISA bool "ISA support" + depends on !XEN ---help--- Find out whether you have ISA slots on your motherboard. ISA is the name of a bus system, i.e. the way the CPU talks to the other stuff @@ -2044,6 +2114,7 @@ source "drivers/eisa/Kconfig" config MCA bool "MCA support" + depends on !XEN ---help--- MicroChannel Architecture is found in some IBM PS/2 machines and laptops. It is a bus system similar to PCI or ISA. 
See @@ -2075,7 +2146,7 @@ config SCx200HR_TIMER config OLPC bool "One Laptop Per Child support" - depends on !X86_PAE + depends on !X86_PAE && !XEN select GPIOLIB select OF select OF_PROMTREE @@ -2140,7 +2211,7 @@ endif # X86_32 config AMD_NB def_bool y - depends on CPU_SUP_AMD && PCI + depends on CPU_SUP_AMD && PCI && !XEN_UNPRIVILEGED_GUEST source "drivers/pcmcia/Kconfig" @@ -2210,7 +2281,9 @@ source "net/Kconfig" source "drivers/Kconfig" +if !XEN_UNPRIVILEGED_GUEST source "drivers/firmware/Kconfig" +endif source "fs/Kconfig" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 3c57033..028a6cf 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -6,7 +6,7 @@ choice config M386 bool "386" - depends on X86_32 && !UML + depends on X86_32 && !UML && !XEN ---help--- This is the processor type of your CPU. This information is used for optimizing purposes. In order to compile a kernel that can run on @@ -47,7 +47,7 @@ config M386 config M486 bool "486" - depends on X86_32 + depends on X86_32 && !XEN ---help--- Select this for a 486 series processor, either Intel or one of the compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX, @@ -56,7 +56,7 @@ config M486 config M586 bool "586/K5/5x86/6x86/6x86MX" - depends on X86_32 + depends on X86_32 && !XEN ---help--- Select this for an 586 or 686 series processor such as the AMD K5, the Cyrix 5x86, 6x86 and 6x86MX. This choice does not @@ -64,14 +64,14 @@ config M586 config M586TSC bool "Pentium-Classic" - depends on X86_32 + depends on X86_32 && !XEN ---help--- Select this for a Pentium Classic processor with the RDTSC (Read Time Stamp Counter) instruction for benchmarking. config M586MMX bool "Pentium-MMX" - depends on X86_32 + depends on X86_32 && !XEN ---help--- Select this for a Pentium with the MMX graphics/multimedia extended instructions. @@ -396,6 +396,7 @@ config X86_P6_NOP config X86_TSC def_bool y depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) && !X86_NUMAQ) || X86_64 + depends on !XEN config X86_CMPXCHG64 def_bool y @@ -441,7 +442,7 @@ config CPU_SUP_INTEL config CPU_SUP_CYRIX_32 default y bool "Support Cyrix processors" if PROCESSOR_SELECT - depends on !64BIT + depends on !64BIT && !XEN ---help--- This enables detection, tunings and quirks for Cyrix processors @@ -495,7 +496,7 @@ config CPU_SUP_TRANSMETA_32 config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT - depends on !64BIT + depends on !64BIT && !XEN ---help--- This enables detection, tunings and quirks for UMC processors diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index e46c214..2b27700 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -25,6 +25,7 @@ config STRICT_DEVMEM config X86_VERBOSE_BOOTUP bool "Enable verbose x86 bootup info messages" default y + depends on !XEN ---help--- Enables the informational output from the decompression stage (e.g. bzImage) of the boot. 
If you disable this you will still @@ -32,6 +33,7 @@ config X86_VERBOSE_BOOTUP config EARLY_PRINTK bool "Early printk" if EXPERT + depends on !XEN_UNPRIVILEGED_GUEST default y ---help--- Write kernel log output directly into the VGA buffer or to a serial @@ -122,7 +124,7 @@ config DEBUG_NX_TEST config DOUBLEFAULT default y bool "Enable doublefault exception handler" if EXPERT - depends on X86_32 + depends on X86_32 && !X86_NO_TSS ---help--- This option allows trapping of rare doublefault exceptions that would otherwise cause a system to silently reboot. Disabling this @@ -162,6 +164,7 @@ config IOMMU_LEAK config HAVE_MMIOTRACE_SUPPORT def_bool y + depends on !XEN config X86_DECODER_SELFTEST bool "x86 instruction decoder selftest" @@ -250,6 +253,7 @@ config DEBUG_BOOT_PARAMS bool "Debug boot parameters" depends on DEBUG_KERNEL depends on DEBUG_FS + depends on !XEN ---help--- This option will cause struct boot_params to be exported via debugfs. diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a2bfff7..67238b0 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -157,8 +157,27 @@ boot := arch/x86/boot BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage -PHONY += bzImage $(BOOT_TARGETS) +PHONY += bzImage vmlinuz $(BOOT_TARGETS) +ifdef CONFIG_XEN +LINUXINCLUDE := -D__XEN_INTERFACE_VERSION__=$(CONFIG_XEN_INTERFACE_VERSION) \ + -I$(srctree)/arch/x86/include/mach-xen $(LINUXINCLUDE) + +ifdef CONFIG_X86_64 +LDFLAGS_vmlinux := -e startup_64 +endif + +# Default kernel to build +all: vmlinuz + +# KBUILD_IMAGE specifies the target image being built +KBUILD_IMAGE := $(boot)/vmlinuz + +vmlinuz: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) + $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot + $(Q)ln -fsn ../../x86/boot/$@ $(objtree)/arch/$(UTS_MACHINE)/boot/$@ +else # Default kernel to build all: bzImage @@ -172,6 +191,7 @@ endif $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ +endif $(BOOT_TARGETS): vmlinux $(Q)$(MAKE) $(build)=$(boot) $@ diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 95365a8..3134798 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -17,6 +17,7 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf bzImage +targets += vmlinuz vmlinux-stripped targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed @@ -188,6 +189,20 @@ bzlilo: $(obj)/bzImage cp System.map $(INSTALL_PATH)/ if [ -x /sbin/lilo ]; then /sbin/lilo; else /etc/lilo/install; fi +$(obj)/vmlinuz: $(obj)/vmlinux-stripped FORCE + $(call if_changed,gzip) + @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' + +$(obj)/vmlinux-stripped: OBJCOPYFLAGS := -g --strip-unneeded +$(obj)/vmlinux-stripped: vmlinux FORCE + $(call if_changed,objcopy) + +ifndef CONFIG_XEN +bzImage := bzImage +else +bzImage := vmlinuz +endif + install: - sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/bzImage \ + sh $(srctree)/$(src)/install.sh $(KERNELRELEASE) $(obj)/$(bzImage) \ System.map "$(INSTALL_PATH)" diff --git a/arch/x86/ia32/ia32entry-xen.S b/arch/x86/ia32/ia32entry-xen.S new file mode 100644 index 0000000..dce835b --- /dev/null +++ b/arch/x86/ia32/ia32entry-xen.S @@ -0,0 +1,383 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + + .section .entry.text, "ax" + + .macro IA32_ARG_FIXUP noebp=0 + movl %edi,%r8d + .if \noebp + jmp .Lia32_common + .else + movl %ebp,%r9d +.Lia32_common: + .endif + xchg %ecx,%esi + movl %ebx,%edi + movl %edx,%edx /* zero extension */ + .endm + + /* clobbers %eax */ + .macro CLEAR_RREGS offset=0, _r9=rax + xorl %eax,%eax + movq %rax,\offset+R11(%rsp) + movq %rax,\offset+R10(%rsp) + movq %\_r9,\offset+R9(%rsp) + movq %rax,\offset+R8(%rsp) + .endm + + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. Instead, we just truncate that + * value to 32 bits again as we did on entry from user mode. + * If it's a new value set by user_regset during entry tracing, + * this matches the normal truncation of the user-mode value. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + .macro LOAD_ARGS32 offset, _r9=0 + .if \_r9 + movl \offset+16(%rsp),%r9d + .endif + movl \offset+40(%rsp),%ecx + movl \offset+48(%rsp),%edx + movl \offset+56(%rsp),%esi + movl \offset+64(%rsp),%edi + movl %eax,%eax /* zero extension */ + .endm + + .macro CFI_STARTPROC32 simple + CFI_STARTPROC \simple + CFI_UNDEFINED r8 + CFI_UNDEFINED r9 + CFI_UNDEFINED r10 + CFI_UNDEFINED r11 + CFI_UNDEFINED r12 + CFI_UNDEFINED r13 + CFI_UNDEFINED r14 + CFI_UNDEFINED r15 + .endm + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) + +ENTRY(native_irq_enable_sysexit) + swapgs + sti + sysexit +ENDPROC(native_irq_enable_sysexit) +#endif + +/* + * 32bit SYSENTER instruction entry. + * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx Arg2 + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp user stack + * 0(%ebp) Arg6 + * + * Interrupts on. + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. Set up a complete hardware stack frame to share code + * with the int 0x80 path. 
+ */ +ENTRY(ia32_sysenter_target) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-RIP+16 + /*CFI_REL_OFFSET ss,SS-RIP+16*/ + CFI_REL_OFFSET rsp,RSP-RIP+16 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ + /*CFI_REL_OFFSET cs,CS-RIP+16*/ + CFI_REL_OFFSET rip,RIP-RIP+16 + CFI_REL_OFFSET r11,8 + CFI_REL_OFFSET rcx,0 + movq 8(%rsp),%r11 + CFI_RESTORE r11 + popq_cfi %rcx + CFI_RESTORE rcx + movl %ebp,%ebp /* zero extension */ + movl %eax,%eax + movl TI_sysenter_return+THREAD_INFO(%rsp,8*6-KERNEL_STACK_OFFSET),%r10d + movl $__USER32_DS,40(%rsp) + movq %rbp,32(%rsp) + movl $__USER32_CS,16(%rsp) + movq %r10,8(%rsp) + movq %rax,(%rsp) + cld + SAVE_ARGS 0,1,0 + /* no need to do an access_ok check here because rbp has been + 32bit zero extended */ +1: movl (%rbp),%ebp + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jnz sysenter_tracesys + jmp .Lia32_check_call + +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + movl %esi,%r9d /* 6th arg: 4th syscall arg */ + movl %edx,%r8d /* 5th arg: 3rd syscall arg */ + /* (already in %ecx) 4th arg: 2nd syscall arg */ + movl %ebx,%edx /* 3rd arg: 1st syscall arg */ + movl %eax,%esi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ + call __audit_syscall_entry + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ + cmpq $(IA32_NR_syscalls-1),%rax + ja ia32_badsys + movl %ebx,%edi /* reload 1st syscall arg */ + movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ + movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ + movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */ + movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ + .endm + +sysenter_auditsys: + auditsys_entry_common + movl %ebp,%r9d /* reload 6th syscall arg */ + jmp .Lia32_dispatch +#endif + CFI_ENDPROC +ENDPROC(ia32_sysenter_target) + +/* + * 32bit SYSCALL instruction entry. + * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx return EIP + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp Arg2 [note: not saved in the stack frame, should not be touched] + * %esp user stack + * 0(%esp) Arg6 + * + * Interrupts on. + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. Set up a complete hardware stack frame to share code + * with the int 0x80 path. 
+ */ +ENTRY(ia32_cstar_target) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-RIP+16 + /*CFI_REL_OFFSET ss,SS-RIP+16*/ + CFI_REL_OFFSET rsp,RSP-RIP+16 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ + /*CFI_REL_OFFSET cs,CS-RIP+16*/ + CFI_REL_OFFSET rip,RIP-RIP+16 + movl %eax,%eax /* zero extension */ + movl RSP-RIP+16(%rsp),%r8d + SAVE_ARGS -8,0,0 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ + movl %ebp,%ecx + movl $__USER32_CS,CS-ARGOFFSET(%rsp) + movl $__USER32_DS,SS-ARGOFFSET(%rsp) + /* no need to do an access_ok check here because r8 has been + 32bit zero extended */ + /* hardware stack frame is complete now */ +1: movl (%r8),%r9d + .section __ex_table,"a" + .quad 1b,ia32_badarg + .previous + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jnz cstar_tracesys + cmpq $IA32_NR_syscalls-1,%rax + ja ia32_badsys +cstar_do_call: + IA32_ARG_FIXUP 1 + +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: + movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ + auditsys_entry_common + movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ + jmp .Lia32_dispatch +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jz cstar_auditsys +#endif + xchgl %r9d,%ebp + SAVE_REST + CLEAR_RREGS 0, r9 + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ + RESTORE_REST + xchgl %ebp,%r9d + cmpq $(IA32_NR_syscalls-1),%rax + ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ + jmp cstar_do_call +END(ia32_cstar_target) + +ia32_badarg: + movq $-EFAULT,%rax + jmp ia32_sysret + CFI_ENDPROC + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * %eax System call number. + * %ebx Arg1 + * %ecx Arg2 + * %edx Arg3 + * %esi Arg4 + * %edi Arg5 + * %ebp Arg6 [note: not saved in the stack frame, should not be touched] + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except %eax must be saved (but ptrace may violate that) + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts on. + */ + +ENTRY(ia32_syscall) + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-RIP+16 + /*CFI_REL_OFFSET ss,SS-RIP+16*/ + CFI_REL_OFFSET rsp,RSP-RIP+16 + /*CFI_REL_OFFSET rflags,EFLAGS-RIP+16*/ + /*CFI_REL_OFFSET cs,CS-RIP+16*/ + CFI_REL_OFFSET rip,RIP-RIP+16 + CFI_REL_OFFSET r11,8 + CFI_REL_OFFSET rcx,0 + movq 8(%rsp),%r11 + CFI_RESTORE r11 + popq_cfi %rcx + CFI_RESTORE rcx + movl %eax,%eax + movq %rax,(%rsp) + cld + /* note the registers are not zero extended to the sf. + this could be a problem. 
*/ + SAVE_ARGS 0,1,0 + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jnz ia32_tracesys +.Lia32_check_call: + cmpq $(IA32_NR_syscalls-1),%rax + ja ia32_badsys +ia32_do_call: + IA32_ARG_FIXUP +.Lia32_dispatch: + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative +ia32_sysret: + movq %rax,RAX-ARGOFFSET(%rsp) + CLEAR_RREGS -ARGOFFSET + jmp int_ret_from_sys_call + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jz sysenter_auditsys +#endif +ia32_tracesys: + SAVE_REST + CLEAR_RREGS + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + RESTORE_REST + cmpq $(IA32_NR_syscalls-1),%rax + ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ + jmp ia32_do_call +END(ia32_syscall) + +ia32_badsys: + movq $0,ORIG_RAX-ARGOFFSET(%rsp) + movq $-ENOSYS,%rax + jmp ia32_sysret + + CFI_ENDPROC + + .macro PTREGSCALL label, func, arg + ALIGN +GLOBAL(\label) + leaq \func(%rip),%rax + leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ + jmp ia32_ptregs_common + .endm + + CFI_STARTPROC32 + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi + PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi + PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx + PTREGSCALL stub32_execve, sys32_execve, %rcx + PTREGSCALL stub32_fork, sys_fork, %rdi + PTREGSCALL stub32_clone, sys32_clone, %rdx + PTREGSCALL stub32_vfork, sys_vfork, %rdi + PTREGSCALL stub32_iopl, sys_iopl, %rsi + + ALIGN +ia32_ptregs_common: + popq %r11 + CFI_ENDPROC + CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,SS+8-ARGOFFSET + CFI_REL_OFFSET rax,RAX-ARGOFFSET + CFI_REL_OFFSET rcx,RCX-ARGOFFSET + CFI_REL_OFFSET rdx,RDX-ARGOFFSET + CFI_REL_OFFSET rsi,RSI-ARGOFFSET + CFI_REL_OFFSET rdi,RDI-ARGOFFSET + CFI_REL_OFFSET rip,RIP-ARGOFFSET +/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ +/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ + CFI_REL_OFFSET rsp,RSP-ARGOFFSET +/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ + SAVE_REST + call *%rax + RESTORE_REST + jmp ia32_sysret /* misbalances the return cache */ + CFI_ENDPROC +END(ia32_ptregs_common) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 610001d..d39db8a 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -31,6 +31,10 @@ #include #include +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#include +#endif + #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long @@ -115,7 +119,11 @@ static inline void acpi_disable_pci(void) } /* Low-level suspend routine. 
*/ +#ifdef CONFIG_ACPI_PV_SLEEP +#define acpi_suspend_lowlevel() acpi_enter_sleep_state(ACPI_STATE_S3) +#else extern int acpi_suspend_lowlevel(void); +#endif extern const unsigned char acpi_wakeup_code[]; #define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code))) @@ -123,11 +131,33 @@ extern const unsigned char acpi_wakeup_code[]; /* early initialization routine */ extern void acpi_reserve_wakeup_memory(void); +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +static inline int acpi_notify_hypervisor_state(u8 sleep_state, + u32 pm1a_cnt_val, + u32 pm1b_cnt_val) +{ + struct xen_platform_op op = { + .cmd = XENPF_enter_acpi_sleep, + .interface_version = XENPF_INTERFACE_VERSION, + .u = { + .enter_acpi_sleep = { + .pm1a_cnt_val = pm1a_cnt_val, + .pm1b_cnt_val = pm1b_cnt_val, + .sleep_state = sleep_state, + }, + }, + }; + + return HYPERVISOR_platform_op(&op); +} +#endif + /* * Check if the CPU can handle C2 and deeper */ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) { +#ifndef CONFIG_PROCESSOR_EXTERNAL_CONTROL /* * Early models (<=5) of AMD Opterons are not supposed to go into * C2 state. @@ -142,6 +172,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) else if (amd_e400_c1e_detected) return 1; else +#endif return max_cstate; } @@ -181,7 +212,9 @@ static inline void disable_acpi(void) { } #endif /* !CONFIG_ACPI */ +#ifndef CONFIG_XEN #define ARCH_HAS_POWER_INIT 1 +#endif #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index eec2a70..91e72c0 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -15,6 +15,9 @@ #define map_page_into_agp(page) set_pages_uc(page, 1) #define unmap_page_from_agp(page) set_pages_wb(page, 1) +#define map_pages_into_agp set_pages_array_uc +#define unmap_pages_from_agp set_pages_array_wb + /* * Could use CLFLUSH here if the cpu supports it. 
But then it would * need to be called for each cacheline of the whole page so it may diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3ab9bdd..be3c6f6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -9,12 +9,16 @@ #include #include #include +#ifndef CONFIG_XEN #include +#endif #include #include #include +#ifndef CONFIG_XEN #define ARCH_APICTIMER_STOPS_ON_C3 1 +#endif /* * Debugging macros @@ -46,6 +50,7 @@ static inline void generic_apic_probe(void) #ifdef CONFIG_X86_LOCAL_APIC extern unsigned int apic_verbosity; +#ifndef CONFIG_XEN extern int local_apic_timer_c2_ok; extern int disable_apic; @@ -119,6 +124,8 @@ extern u64 native_apic_icr_read(void); extern int x2apic_mode; +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_X2APIC /* * Make previous memory operations globally visible before @@ -238,7 +245,11 @@ extern void setup_local_APIC(void); extern void end_local_APIC_setup(void); extern void bsp_end_local_APIC_setup(void); extern void init_apic_mappings(void); +#ifndef CONFIG_XEN void register_lapic_address(unsigned long address); +#else +#define register_lapic_address(address) +#endif extern void setup_boot_APIC_clock(void); extern void setup_secondary_APIC_clock(void); extern int APIC_init_uniprocessor(void); @@ -286,15 +297,18 @@ static inline void disable_local_APIC(void) { } struct apic { char *name; +#ifndef CONFIG_XEN int (*probe)(void); int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id); int (*apic_id_registered)(void); +#endif u32 irq_delivery_mode; u32 irq_dest_mode; const struct cpumask *(*target_cpus)(void); +#ifndef CONFIG_XEN int disable_esr; int dest_logical; @@ -313,8 +327,10 @@ struct apic { void (*setup_portio_remap)(void); int (*check_phys_apicid_present)(int phys_apicid); void (*enable_apic_mode)(void); +#endif int (*phys_pkg_id)(int cpuid_apic, int index_msb); +#ifndef CONFIG_XEN /* * When one of the next two hooks returns 1 the apic * is switched to this. Essentially they are additional @@ -329,6 +345,7 @@ struct apic { unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, const struct cpumask *andmask); +#endif /* ipi */ void (*send_IPI_mask)(const struct cpumask *mask, int vector); @@ -338,6 +355,7 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); +#ifndef CONFIG_XEN /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); @@ -377,6 +395,7 @@ struct apic { */ int (*x86_32_numa_cpu_node)(int cpu); #endif +#endif /* CONFIG_XEN */ }; /* @@ -386,6 +405,8 @@ struct apic { */ extern struct apic *apic; +#ifndef CONFIG_XEN + /* * APIC drivers are probed based on how they are listed in the .apicdrivers * section. 
So the order is important and enforced by the ordering @@ -504,6 +525,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert) extern void generic_bigsmp_probe(void); +#endif /* CONFIG_XEN */ #ifdef CONFIG_X86_LOCAL_APIC @@ -520,6 +542,8 @@ static inline const struct cpumask *default_target_cpus(void) #endif } +#ifndef CONFIG_XEN + DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); @@ -626,6 +650,8 @@ extern int default_cpu_present_to_apicid(int mps_cpu); extern int default_check_phys_apicid_present(int phys_apicid); #endif +#endif /* CONFIG_XEN */ + #endif /* CONFIG_X86_LOCAL_APIC */ #endif /* _ASM_X86_APIC_H */ diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 134bba0..96fd18b 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -17,6 +17,8 @@ */ #define IO_APIC_SLOT_SIZE 1024 +#ifndef CONFIG_XEN + #define APIC_ID 0x20 #define APIC_LVR 0x30 @@ -147,6 +149,16 @@ #define XAPIC_ENABLE (1UL << 11) #define X2APIC_ENABLE (1UL << 10) +#else /* CONFIG_XEN */ + +enum { + APIC_DEST_ALLBUT = 0x1, + APIC_DEST_SELF, + APIC_DEST_ALLINC +}; + +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_32 # define MAX_IO_APICS 64 # define MAX_LOCAL_APIC 256 @@ -155,6 +167,8 @@ # define MAX_LOCAL_APIC 32768 #endif +#ifndef CONFIG_XEN + /* * All x86-64 systems are xAPIC compatible. * In the following, "apicid" is a physical APIC ID. @@ -425,6 +439,8 @@ struct local_apic { #undef u32 +#endif /* CONFIG_XEN */ + #ifdef CONFIG_X86_32 #define BAD_APICID 0xFFu #else diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 5e1a2ee..2d2275a 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -16,7 +16,7 @@ & ~(CONFIG_PHYSICAL_ALIGN - 1)) /* Minimum kernel alignment, as a power of two */ -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) #define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT #else #define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT + THREAD_ORDER) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index dcb839e..dc9a154 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -293,7 +293,11 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1) #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) +#ifndef CONFIG_XEN #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#else +#define cpu_has_xsave boot_cpu_has(X86_FEATURE_OSXSAVE) +#endif #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index b903d5e..32cc72e 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -101,7 +101,7 @@ extern void aout_dump_debugregs(struct user *dump); extern void hw_breakpoint_restore(void); -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_X86_NO_IDT) DECLARE_PER_CPU(int, debug_stack_usage); static inline void debug_stack_usage_inc(void) { diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 3778256..ba52483 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -66,7 +66,11 @@ struct e820map { struct e820entry map[E820_X_MAX]; }; +#ifndef CONFIG_XEN #define ISA_START_ADDRESS 0xa0000 +#else +#define ISA_START_ADDRESS 0 +#endif #define 
ISA_END_ADDRESS 0x100000 #define BIOS_BEGIN 0x000a0000 diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index da0b3ca..b0ce06c 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -19,7 +19,11 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; +#ifndef CONFIG_XEN unsigned int irq_tlb_count; +#else + unsigned int irq_lock_count; +#endif #endif #ifdef CONFIG_X86_THERMAL_VECTOR unsigned int irq_thermal_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eb92a6e..b54fa50 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -101,6 +101,7 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr, irq_attr->polarity = polarity; } +#ifndef CONFIG_XEN struct irq_2_iommu { struct intel_iommu *iommu; u16 irte_index; @@ -123,6 +124,9 @@ struct irq_cfg { struct irq_2_iommu irq_2_iommu; #endif }; +#else +struct irq_cfg; +#endif extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); extern void send_cleanup_vector(struct irq_cfg *); @@ -159,9 +163,15 @@ extern void smp_invalidate_interrupt(struct pt_regs *); #else extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *); #endif +extern void smp_irq_work_interrupt(struct pt_regs *); +#ifdef CONFIG_XEN +extern void smp_reboot_interrupt(struct pt_regs *); +#endif #endif +#ifndef CONFIG_XEN extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); +#endif typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 7a15153..2bb1d90 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -60,3 +60,7 @@ static inline bool hypervisor_x2apic_available(void) } #endif + +#ifdef HAVE_XEN_PLATFORM_COMPAT_H +#include_next +#endif diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index a203659..b9daf61 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -54,6 +54,7 @@ extern struct irq_chip i8259A_chip; struct legacy_pic { int nr_legacy_irqs; +#ifndef CONFIG_XEN struct irq_chip *chip; void (*mask)(unsigned int irq); void (*unmask)(unsigned int irq); @@ -61,6 +62,7 @@ struct legacy_pic { void (*restore_mask)(void); void (*init)(int auto_eoi); int (*irq_pending)(unsigned int irq); +#endif void (*make_irq)(unsigned int irq); }; diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index d8e8eef..51cd71d 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -331,7 +331,7 @@ extern void early_iounmap(void __iomem *addr, unsigned long size); extern void fixup_early_ioremap(void); extern bool is_early_ioremap_ptep(pte_t *ptep); -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN #include struct bio_vec; @@ -341,7 +341,7 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \ (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2))) -#endif /* CONFIG_XEN */ +#endif /* CONFIG_PARAVIRT_XEN */ #define IO_SPACE_LIMIT 0xffff diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 317ff17..2e587bc 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -5,14 +5,30 @@ # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_PGD 2 +# ifndef CONFIG_XEN # define PA_SWAP_PAGE 3 # 
define PAGES_NR 4 +# else /* CONFIG_XEN */ +/* + * The hypervisor interface implicitly requires that all entries (except + * for possibly the final one) are arranged in matching PA_/VA_ pairs. +# define VA_PGD 3 + */ +# define PA_SWAP_PAGE 4 +# define PAGES_NR 5 +# endif /* CONFIG_XEN */ #else # define PA_CONTROL_PAGE 0 # define VA_CONTROL_PAGE 1 # define PA_TABLE_PAGE 2 +# ifndef CONFIG_XEN # define PA_SWAP_PAGE 3 # define PAGES_NR 4 +# else /* CONFIG_XEN, see comment above +# define VA_TABLE_PAGE 3 */ +# define PA_SWAP_PAGE 4 +# define PAGES_NR 5 +# endif /* CONFIG_XEN */ #endif # define KEXEC_CONTROL_CODE_MAX_SIZE 2048 @@ -163,6 +179,19 @@ struct kimage_arch { }; #endif +/* Under Xen we need to work with machine addresses. These macros give the + * machine address of a certain page to the generic kexec code instead of + * the pseudo physical address which would be given by the default macros. + */ + +#ifdef CONFIG_XEN +#define KEXEC_ARCH_HAS_PAGE_MACROS +#define kexec_page_to_pfn(page) pfn_to_mfn(page_to_pfn(page)) +#define kexec_pfn_to_page(pfn) pfn_to_page(mfn_to_pfn(pfn)) +#define kexec_virt_to_phys(addr) virt_to_machine(addr) +#define kexec_phys_to_virt(addr) phys_to_virt(machine_to_phys(addr)) +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h index a01e7ec7..a39d3e1 100644 --- a/arch/x86/include/asm/mach_traps.h +++ b/arch/x86/include/asm/mach_traps.h @@ -5,6 +5,8 @@ #ifndef _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H #define _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H +#include +#include #include #define NMI_REASON_PORT 0x61 @@ -22,6 +24,29 @@ static inline unsigned char default_get_nmi_reason(void) return inb(NMI_REASON_PORT); } +static inline void clear_serr_error(unsigned char reason) +{ + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; + outb(reason, NMI_REASON_PORT); +} + +static inline void clear_io_check_error(unsigned char reason) +{ + unsigned long i; + + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); + + i = 20000; + while (--i) { + touch_nmi_watchdog(); + udelay(100); + } + + reason &= ~NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); +} + static inline void reassert_nmi(void) { int old_reg = -1; diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 0e8e85b..6c8b3a8 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -14,7 +14,7 @@ #define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */ #endif -#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG) +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) && defined(__HAVE_ARCH_CMPXCHG) /* * This lock provides nmi access to the CMOS/RTC registers. It has some * special properties. It is owned by a CPU and stores the index register diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 5f55e69..e9b162a 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -16,12 +16,15 @@ typedef struct { /* True if mm supports a task running in 32 bit compatibility mode. 
*/ unsigned short ia32_compat; #endif +#ifdef CONFIG_XEN + bool has_foreign_mappings:1; +#endif struct mutex lock; void *vdso; } mm_context_t; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) void leave_mm(int cpu); #else static inline void leave_mm(int cpu) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index fd3f9f1..d3d8968 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,7 +17,10 @@ struct ctl_table; extern int proc_nmi_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; +#endif +#if (defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN)) || \ + (defined(CONFIG_XEN_SMPBOOT) && CONFIG_XEN_COMPAT >= 0x030200) void arch_trigger_all_cpu_backtrace(void); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace #endif diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 7639dbf..63775df 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -69,7 +69,15 @@ extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_FLATMEM +/* + * While max_pfn is not exported, max_mapnr never gets initialized for non-Xen + * other than for hotplugged memory. + */ +#ifndef CONFIG_XEN #define pfn_valid(pfn) ((pfn) < max_pfn) +#else +#define pfn_valid(pfn) ((pfn) < max_mapnr) +#endif #endif #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 3566454..b229c07 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -133,6 +133,8 @@ struct pt_regs { #include #ifdef CONFIG_PARAVIRT #include +#elif defined(CONFIG_X86_64_XEN) +#include #endif struct cpuinfo_x86; @@ -193,7 +195,13 @@ static inline int v8086_mode(struct pt_regs *regs) #ifdef CONFIG_X86_64 static inline bool user_64bit_mode(struct pt_regs *regs) { -#ifndef CONFIG_PARAVIRT +#if defined(CONFIG_XEN) + /* + * On Xen, these are the only long mode CPL 3 selectors. + * We do not allow long mode selectors in the LDT. + */ + return regs->cs == __USER_CS || regs->cs == FLAT_USER_CS64; +#elif !defined(CONFIG_PARAVIRT) /* * On non-paravirt systems, this is the only long mode CPL 3 * selector. We do not allow long mode selectors in the LDT. 
@@ -286,7 +294,9 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, } #define arch_has_single_step() (1) -#ifdef CONFIG_X86_DEBUGCTLMSR +#if defined(CONFIG_XEN) +#define arch_has_block_step() (0) +#elif defined(CONFIG_X86_DEBUGCTLMSR) #define arch_has_block_step() (1) #else #define arch_has_block_step() (boot_cpu_data.x86 >= 6) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 6c7fc25..b0549bf 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -48,7 +48,7 @@ #endif #ifdef CONFIG_X86_64 -#ifdef CONFIG_PARAVIRT +#if defined(CONFIG_PARAVIRT) || defined(CONFIG_XEN) /* Paravirtualized systems may not have PSE or PGE available */ #define NEED_PSE 0 #define NEED_PGE 0 diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5e64171..1a47771 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -188,7 +188,9 @@ #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) -#ifndef CONFIG_PARAVIRT +#if defined(CONFIG_X86_XEN) +#define get_kernel_rpl() (xen_feature(XENFEAT_supervisor_mode_kernel)?0:1) +#elif !defined(CONFIG_PARAVIRT) #define get_kernel_rpl() 0 #endif diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index cfd8144..8d30476 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -95,6 +95,9 @@ struct thread_info { #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ +#if defined(CONFIG_X86_XEN) && defined(CONFIG_CPU_SUP_AMD) +#define TIF_CSTAR 31 /* cstar-based syscall (special handling) */ +#endif #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -116,6 +119,7 @@ struct thread_info { #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_CSTAR (1 << TIF_CSTAR) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ @@ -143,9 +147,13 @@ struct thread_info { _TIF_USER_RETURN_NOTIFY) /* flags to check in __switch_to() */ +#ifndef CONFIG_XEN #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) +#else +#define _TIF_WORK_CTXSW (_TIF_NOTSC /*todo | _TIF_BLOCKSTEP */) +#endif #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index b9676ae..7fb31d4 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -30,7 +30,7 @@ # define ENABLE_TOPO_DEFINES # endif #else -# ifdef CONFIG_SMP +# if defined(CONFIG_SMP) && !defined(CONFIG_XEN) # define ENABLE_TOPO_DEFINES # endif #endif diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index feca311..0a55878 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -1,4 +1,4 @@ -#ifndef _ASM_X86_TRAMPOLINE_H +#if !defined(_ASM_X86_TRAMPOLINE_H) && !defined(CONFIG_XEN) #define _ASM_X86_TRAMPOLINE_H #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 
0012d09..37da0d0 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -40,6 +40,9 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); +#ifdef CONFIG_X86_XEN +asmlinkage void fixup_4gb_segment(void); +#endif dotraplinkage void do_divide_error(struct pt_regs *, long); dotraplinkage void do_debug(struct pt_regs *, long); @@ -68,6 +71,9 @@ dotraplinkage void do_machine_check(struct pt_regs *, long); dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); +#ifdef CONFIG_XEN +void do_fixup_4gb_segment(struct pt_regs *, long); +#endif #endif static inline int get_si_code(unsigned long condition) diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 21f7385..1c44df1 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -11,7 +11,7 @@ #ifndef _ASM_X86_UV_UV_HUB_H #define _ASM_X86_UV_UV_HUB_H -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_UV #include #include #include diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 5728852..9219196 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -48,6 +48,7 @@ #include #include #include +#include /* * The hypercall asms have to meet several constraints: diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 66d0fff..41ff2bd 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -58,7 +58,7 @@ static inline uint32_t xen_cpuid_base(void) return 0; } -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN extern bool xen_hvm_need_lapic(void); static inline bool xen_x2apic_para_available(void) diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index a1f2db5..40c95d2 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -10,17 +10,20 @@ #define _ASM_X86_XEN_INTERFACE_H #ifdef __XEN__ -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name #else -#define __DEFINE_GUEST_HANDLE(name, type) \ +#define ___DEFINE_XEN_GUEST_HANDLE(name, type) \ typedef type * __guest_handle_ ## name #endif +#define __DEFINE_XEN_GUEST_HANDLE(name, type) \ + ___DEFINE_XEN_GUEST_HANDLE(name, type); \ + ___DEFINE_XEN_GUEST_HANDLE(const_##name, const type) #define DEFINE_GUEST_HANDLE_STRUCT(name) \ - __DEFINE_GUEST_HANDLE(name, struct name) -#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) -#define GUEST_HANDLE(name) __guest_handle_ ## name + __DEFINE_XEN_GUEST_HANDLE(name, struct name) +#define DEFINE_XEN_GUEST_HANDLE(name) __DEFINE_XEN_GUEST_HANDLE(name, name) +#define XEN_GUEST_HANDLE(name) __guest_handle_ ## name #ifdef __XEN__ #if defined(__i386__) @@ -47,15 +50,8 @@ #endif #ifndef __ASSEMBLY__ -/* Guest handles for primitive C types. 
*/ -__DEFINE_GUEST_HANDLE(uchar, unsigned char); -__DEFINE_GUEST_HANDLE(uint, unsigned int); -__DEFINE_GUEST_HANDLE(ulong, unsigned long); -DEFINE_GUEST_HANDLE(char); -DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(long); -DEFINE_GUEST_HANDLE(void); -DEFINE_GUEST_HANDLE(uint64_t); +typedef unsigned long xen_pfn_t; +typedef unsigned long xen_ulong_t; #endif #ifndef HYPERVISOR_VIRT_START @@ -67,7 +63,7 @@ DEFINE_GUEST_HANDLE(uint64_t); #define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT) /* Maximum number of virtual CPUs in multi-processor guests. */ -#define MAX_VIRT_CPUS 32 +#define XEN_LEGACY_MAX_VCPUS 32 /* * SEGMENT DESCRIPTOR TABLES diff --git a/arch/x86/include/mach-xen/asm/agp.h b/arch/x86/include/mach-xen/asm/agp.h new file mode 100644 index 0000000..e2a122d --- /dev/null +++ b/arch/x86/include/mach-xen/asm/agp.h @@ -0,0 +1,58 @@ +#ifndef _ASM_X86_AGP_H +#define _ASM_X86_AGP_H + +#include +#include +#include + +/* + * Functions to keep the agpgart mappings coherent with the MMU. The + * GART gives the CPU a physical alias of pages in memory. The alias + * region is mapped uncacheable. Make sure there are no conflicting + * mappings with different cachability attributes for the same + * page. This avoids data corruption on some CPUs. + */ + +#define map_page_into_agp(page) ( \ + xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \ + ?: set_pages_uc(page, 1)) +#define unmap_page_from_agp(page) ( \ + xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \ + /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ + set_pages_wb(page, 1)) + +#define map_pages_into_agp(pages, nr) ({ \ + __typeof__(nr) n__; \ + int rc__ = 0; \ + for (n__ = 0; n__ < (nr) && !rc__; ++n__) \ + rc__ = xen_create_contiguous_region( \ + (unsigned long)page_address((pages)[n__]), 0, 32); \ + rc__ ?: set_pages_array_uc(pages, nr); \ +}) +#define unmap_pages_from_agp(pages, nr) ({ \ + __typeof__(nr) n__; \ + for (n__ = 0; n__ < nr; ++n__) \ + xen_destroy_contiguous_region( \ + (unsigned long)page_address((pages)[n__]), 0); \ + /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ + set_pages_array_wb(pages, nr); \ +}) + +/* + * Could use CLFLUSH here if the cpu supports it. But then it would + * need to be called for each cacheline of the whole page so it may + * not be worth it. Would need a page for it. + */ +#define flush_agp_cache() wbinvd() + +#define virt_to_gart virt_to_machine + +/* GATT allocation. Returns/accepts GATT kernel virtual address. 
*/ +#define alloc_gatt_pages(order) ({ \ + char *_t; dma_addr_t _d; \ + _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \ + _t; }) +#define free_gatt_pages(table, order) \ + dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table)) + +#endif /* _ASM_X86_AGP_H */ diff --git a/arch/x86/include/mach-xen/asm/cmpxchg.h b/arch/x86/include/mach-xen/asm/cmpxchg.h new file mode 100644 index 0000000..17fde1d --- /dev/null +++ b/arch/x86/include/mach-xen/asm/cmpxchg.h @@ -0,0 +1,11 @@ +#ifndef _ASM_X86_XEN_CMPXCHG_H +#define _ASM_X86_XEN_CMPXCHG_H + +#include_next +#ifdef CONFIG_X86_32 +# include "cmpxchg_32.h" +#else +# include "cmpxchg_64.h" +#endif + +#endif /* _ASM_X86_XEN_CMPXCHG_H */ diff --git a/arch/x86/include/mach-xen/asm/cmpxchg_32.h b/arch/x86/include/mach-xen/asm/cmpxchg_32.h new file mode 100644 index 0000000..9effb00 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/cmpxchg_32.h @@ -0,0 +1,24 @@ +#ifndef _ASM_X86_XEN_CMPXCHG_32_H +#define _ASM_X86_XEN_CMPXCHG_32_H + +static inline u64 get_64bit(const volatile u64 *ptr) +{ + u64 res; + __asm__("movl %%ebx,%%eax\n" + "movl %%ecx,%%edx\n" + LOCK_PREFIX "cmpxchg8b %1" + : "=&A" (res) : "m" (*ptr)); + return res; +} + +static inline u64 get_64bit_local(const volatile u64 *ptr) +{ + u64 res; + __asm__("movl %%ebx,%%eax\n" + "movl %%ecx,%%edx\n" + "cmpxchg8b %1" + : "=&A" (res) : "m" (*ptr)); + return res; +} + +#endif /* _ASM_X86_XEN_CMPXCHG_32_H */ diff --git a/arch/x86/include/mach-xen/asm/cmpxchg_64.h b/arch/x86/include/mach-xen/asm/cmpxchg_64.h new file mode 100644 index 0000000..092b27b --- /dev/null +++ b/arch/x86/include/mach-xen/asm/cmpxchg_64.h @@ -0,0 +1,11 @@ +#ifndef _ASM_X86_XEN_CMPXCHG_64_H +#define _ASM_X86_XEN_CMPXCHG_64_H + +static inline u64 get_64bit(const volatile u64 *ptr) +{ + return *ptr; +} + +#define get_64bit_local get_64bit + +#endif /* _ASM_X86_XEN_CMPXCHG_64_H */ diff --git a/arch/x86/include/mach-xen/asm/desc.h b/arch/x86/include/mach-xen/asm/desc.h new file mode 100644 index 0000000..14862a0 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/desc.h @@ -0,0 +1,433 @@ +#ifndef _ASM_X86_DESC_H +#define _ASM_X86_DESC_H + +#include +#include +#include + +#include + +static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info) +{ + desc->limit0 = info->limit & 0x0ffff; + + desc->base0 = (info->base_addr & 0x0000ffff); + desc->base1 = (info->base_addr & 0x00ff0000) >> 16; + + desc->type = (info->read_exec_only ^ 1) << 1; + desc->type |= info->contents << 2; + + desc->s = 1; + desc->dpl = 0x3; + desc->p = info->seg_not_present ^ 1; + desc->limit = (info->limit & 0xf0000) >> 16; + desc->avl = info->useable; + desc->d = info->seg_32bit; + desc->g = info->limit_in_pages; + + desc->base2 = (info->base_addr & 0xff000000) >> 24; + /* + * Don't allow setting of the lm bit. It would confuse + * user_64bit_mode and would get overridden by sysret anyway. 
+ */ + desc->l = 0; +} + +#ifndef CONFIG_X86_NO_IDT +extern struct desc_ptr idt_descr; +extern gate_desc idt_table[]; +extern struct desc_ptr nmi_idt_descr; +extern gate_desc nmi_idt_table[]; +#endif + +struct gdt_page { + struct desc_struct gdt[GDT_ENTRIES]; +} __attribute__((aligned(PAGE_SIZE))); + +DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); + +static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) +{ + return per_cpu(gdt_page, cpu).gdt; +} + +#ifdef CONFIG_X86_64 + +static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, + unsigned dpl, unsigned ist, unsigned seg) +{ + gate->offset_low = PTR_LOW(func); + gate->segment = __KERNEL_CS; + gate->ist = ist; + gate->p = 1; + gate->dpl = dpl; + gate->zero0 = 0; + gate->zero1 = 0; + gate->type = type; + gate->offset_middle = PTR_MIDDLE(func); + gate->offset_high = PTR_HIGH(func); +} + +#else +static inline void pack_gate(gate_desc *gate, unsigned char type, + unsigned long base, unsigned dpl, unsigned flags, + unsigned short seg) +{ + gate->a = (seg << 16) | (base & 0xffff); + gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8); +} + +#endif + +static inline int desc_empty(const void *ptr) +{ + const u32 *desc = ptr; + + return !(desc[0] | desc[1]); +} + +#ifndef CONFIG_XEN +#define load_TR_desc() native_load_tr_desc() +#define load_gdt(dtr) native_load_gdt(dtr) +#define load_idt(dtr) native_load_idt(dtr) +#define load_tr(tr) asm volatile("ltr %0"::"m" (tr)) +#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) + +#define store_gdt(dtr) native_store_gdt(dtr) +#define store_idt(dtr) native_store_idt(dtr) +#define store_tr(tr) (tr = native_store_tr()) + +#define load_TLS(t, cpu) native_load_tls(t, cpu) +#define set_ldt native_set_ldt + +#define write_ldt_entry(dt, entry, desc) native_write_ldt_entry(dt, entry, desc) +#define write_gdt_entry(dt, entry, desc, type) native_write_gdt_entry(dt, entry, desc, type) +#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g) + +static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ +} + +static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) +{ +} + +#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt)) + +static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate) +{ + memcpy(&idt[entry], gate, sizeof(*gate)); +} + +static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc) +{ + memcpy(&ldt[entry], desc, 8); +} + +static inline void +native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type) +{ + unsigned int size; + + switch (type) { + case DESC_TSS: size = sizeof(tss_desc); break; + case DESC_LDT: size = sizeof(ldt_desc); break; + default: size = sizeof(*gdt); break; + } + + memcpy(&gdt[entry], desc, size); +} +#endif + +static inline void pack_descriptor(struct desc_struct *desc, unsigned long base, + unsigned long limit, unsigned char type, + unsigned char flags) +{ + desc->a = ((base & 0xffff) << 16) | (limit & 0xffff); + desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) | + (limit & 0x000f0000) | ((type & 0xff) << 8) | + ((flags & 0xf) << 20); + desc->p = 1; +} + + +#ifndef CONFIG_XEN +static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) +{ +#ifdef CONFIG_X86_64 + struct ldttss_desc64 *desc = d; + + memset(desc, 0, sizeof(*desc)); + + desc->limit0 = size & 0xFFFF; + desc->base0 = PTR_LOW(addr); + 
desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->type = type; + desc->p = 1; + desc->limit1 = (size >> 16) & 0xF; + desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; + desc->base3 = PTR_HIGH(addr); +#else + pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); +#endif +} + +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) +{ + struct desc_struct *d = get_cpu_gdt_table(cpu); + tss_desc tss; + + /* + * sizeof(unsigned long) coming from an extra "long" at the end + * of the iobitmap. See tss_struct definition in processor.h + * + * -1? seg base+limit should be pointing to the address of the + * last valid byte + */ + set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, + IO_BITMAP_OFFSET + IO_BITMAP_BYTES + + sizeof(unsigned long) - 1); + write_gdt_entry(d, entry, &tss, DESC_TSS); +} + +#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) + +static inline void native_set_ldt(const void *addr, unsigned int entries) +{ + if (likely(entries == 0)) + asm volatile("lldt %w0"::"q" (0)); + else { + unsigned cpu = smp_processor_id(); + ldt_desc ldt; + + set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT, + entries * LDT_ENTRY_SIZE - 1); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, + &ldt, DESC_LDT); + asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); + } +} + +static inline void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} + +static inline void native_load_gdt(const struct desc_ptr *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static inline void native_load_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static inline void native_store_gdt(struct desc_ptr *dtr) +{ + asm volatile("sgdt %0":"=m" (*dtr)); +} + +static inline void native_store_idt(struct desc_ptr *dtr) +{ + asm volatile("sidt %0":"=m" (*dtr)); +} + +static inline unsigned long native_store_tr(void) +{ + unsigned long tr; + + asm volatile("str %0":"=r" (tr)); + + return tr; +} + +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + unsigned int i; + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; +} +#else +#include + +#define load_TLS(t, cpu) xen_load_tls(t, cpu) +#define set_ldt xen_set_ldt + +extern int write_ldt_entry(struct desc_struct *ldt, int entry, + const void *desc); +extern int write_gdt_entry(struct desc_struct *gdt, int entry, + const void *desc, int type); + +static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + unsigned int i; + struct desc_struct *gdt = get_cpu_gdt_table(cpu) + GDT_ENTRY_TLS_MIN; + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + if (HYPERVISOR_update_descriptor( + arbitrary_virt_to_machine(&gdt[i]), + *(u64 *)&t->tls_array[i])) + BUG(); +} +#endif + +#define _LDT_empty(info) \ + ((info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0) + +#ifdef CONFIG_X86_64 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) +#else +#define LDT_empty(info) (_LDT_empty(info)) +#endif + +static inline void clear_LDT(void) +{ + set_ldt(NULL, 0); +} + +/* + * load one particular LDT into the current CPU + */ +static inline void load_LDT_nolock(mm_context_t *pc) +{ + set_ldt(pc->ldt, pc->size); +} + +static inline void 
load_LDT(mm_context_t *pc) +{ + preempt_disable(); + load_LDT_nolock(pc); + preempt_enable(); +} + +static inline unsigned long get_desc_base(const struct desc_struct *desc) +{ + return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); +} + +static inline void set_desc_base(struct desc_struct *desc, unsigned long base) +{ + desc->base0 = base & 0xffff; + desc->base1 = (base >> 16) & 0xff; + desc->base2 = (base >> 24) & 0xff; +} + +static inline unsigned long get_desc_limit(const struct desc_struct *desc) +{ + return desc->limit0 | (desc->limit << 16); +} + +static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) +{ + desc->limit0 = limit & 0xffff; + desc->limit = (limit >> 16) & 0xf; +} + +#ifndef CONFIG_X86_NO_IDT +#ifdef CONFIG_X86_64 +static inline void set_nmi_gate(int gate, void *addr) +{ + gate_desc s; + + pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); + write_idt_entry(nmi_idt_table, gate, &s); +} +#endif + +static inline void _set_gate(int gate, unsigned type, void *addr, + unsigned dpl, unsigned ist, unsigned seg) +{ + gate_desc s; + + pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); + /* + * does not need to be atomic because it is only done once at + * setup time + */ + write_idt_entry(idt_table, gate, &s); +} + +/* + * This needs to use 'idt_table' rather than 'idt', and + * thus use the _nonmapped_ version of the IDT, as the + * Pentium F0 0F bugfix can have resulted in the mapped + * IDT being write-protected. + */ +static inline void set_intr_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS); +} + +extern int first_system_vector; +/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */ +extern unsigned long used_vectors[]; + +static inline void alloc_system_vector(int vector) +{ + if (!test_bit(vector, used_vectors)) { + set_bit(vector, used_vectors); + if (first_system_vector > vector) + first_system_vector = vector; + } else { + BUG(); + } +} + +static inline void alloc_intr_gate(unsigned int n, void *addr) +{ + alloc_system_vector(n); + set_intr_gate(n, addr); +} + +/* + * This routine sets up an interrupt gate at directory privilege level 3. 
+ */ +static inline void set_system_intr_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS); +} + +static inline void set_system_trap_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS); +} + +static inline void set_trap_gate(unsigned int n, void *addr) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS); +} + +static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3)); +} + +static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); +} + +static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) +{ + BUG_ON((unsigned)n > 0xFF); + _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); +} +#endif + +#endif /* _ASM_X86_DESC_H */ diff --git a/arch/x86/include/mach-xen/asm/dma-mapping.h b/arch/x86/include/mach-xen/asm/dma-mapping.h new file mode 100644 index 0000000..5054274 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/dma-mapping.h @@ -0,0 +1,25 @@ +#ifndef _ASM_X86_DMA_MAPPING_H_ + +#define phys_to_dma _phys_to_dma_ +#define dma_to_phys _dma_to_phys_ + +#include_next + +#undef phys_to_dma +#undef dma_to_phys + +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) +{ + return phys_to_machine(paddr); +} + +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) +{ + return machine_to_phys(daddr); +} + +void dma_generic_free_coherent(struct device *, size_t, void *, dma_addr_t); + +extern int range_straddles_page_boundary(paddr_t p, size_t size); + +#endif /* _ASM_X86_DMA_MAPPING_H_ */ diff --git a/arch/x86/include/mach-xen/asm/fixmap.h b/arch/x86/include/mach-xen/asm/fixmap.h new file mode 100644 index 0000000..dccdd97 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/fixmap.h @@ -0,0 +1,240 @@ +/* + * fixmap.h: compile-time virtual memory allocation + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 1998 Ingo Molnar + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * x86_32 and x86_64 integration by Gustavo F. Padovan, February 2009 + */ + +#ifndef _ASM_X86_FIXMAP_H +#define _ASM_X86_FIXMAP_H + +#ifndef __ASSEMBLY__ +#include +#include +#include +#ifdef CONFIG_X86_32 +#include +#include +#else +#include +#endif + +/* + * We can't declare FIXADDR_TOP as variable for x86_64 because vsyscall + * uses fixmaps that relies on FIXADDR_TOP for proper address calculation. + * Because of this, FIXADDR_TOP x86 integration was left as later work. + */ +#ifdef CONFIG_X86_32 +/* used by vmalloc.c, vsyscall.lds.S. + * + * Leave one empty page between vmalloc'ed areas and + * the start of the fixmap. + */ +extern unsigned long __FIXADDR_TOP; +#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) + +#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) +#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) +#else +#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) + +/* Only covers 32bit vsyscalls currently. Need another set for 64bit. 
*/ +#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) +#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) +#endif + + +/* + * Here we define all the compile-time 'special' virtual + * addresses. The point is to have a constant address at + * compile time, but to set the physical address only + * in the boot process. + * for x86_32: We allocate these special addresses + * from the end of virtual memory (0xfffff000) backwards. + * Also this lets us do fail-safe vmalloc(), we + * can guarantee that these special addresses and + * vmalloc()-ed addresses never overlap. + * + * These 'compile-time allocated' memory buffers are + * fixed-size 4k pages (or larger if used with an increment + * higher than 1). Use set_fixmap(idx,phys) to associate + * physical memory with fixmap indices. + * + * TLB entries of such buffers will not be flushed across + * task switches. + */ +enum fixed_addresses { +#ifdef CONFIG_X86_32 + FIX_HOLE, + FIX_VDSO, +#else + VSYSCALL_LAST_PAGE, + VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, + VVAR_PAGE, + VSYSCALL_HPET, +#endif + FIX_DBGP_BASE, + FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_LOCAL_APIC + FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +#endif +#ifdef CONFIG_X86_IO_APIC + FIX_IO_APIC_BASE_0, + FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, +#endif +#else + FIX_SHARED_INFO, +#define NR_FIX_ISAMAPS 256 + FIX_ISAMAP_END, + FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, +#endif +#ifdef CONFIG_X86_VISWS_APIC + FIX_CO_CPU, /* Cobalt timer */ + FIX_CO_APIC, /* Cobalt APIC Redirection Table */ + FIX_LI_PCIA, /* Lithium PCI Bridge A */ + FIX_LI_PCIB, /* Lithium PCI Bridge B */ +#endif +#ifdef CONFIG_X86_F00F_BUG + FIX_F00F_IDT, /* Virtual mapping for IDT */ +#endif +#ifdef CONFIG_X86_CYCLONE_TIMER + FIX_CYCLONE_TIMER, /*cyclone timer register*/ +#endif +#ifdef CONFIG_X86_32 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, +#ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, +#endif +#endif +#ifdef CONFIG_PARAVIRT + FIX_PARAVIRT_BOOTMAP, +#endif + FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ + FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ +#ifdef CONFIG_X86_INTEL_MID + FIX_LNW_VRTC, +#endif + __end_of_permanent_fixed_addresses, + + /* + * 256 temporary boot-time mappings, used by early_ioremap(), + * before ioremap() is functional. + * + * If necessary we round it up to the next 256 pages boundary so + * that we can have a single pgd entry and a single pte table: + */ +#define NR_FIX_BTMAPS 64 +#define FIX_BTMAPS_SLOTS 4 +#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS) + FIX_BTMAP_END = + (__end_of_permanent_fixed_addresses ^ + (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) & + -PTRS_PER_PTE + ? 
__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - + (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1)) + : __end_of_permanent_fixed_addresses, + FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, +#ifdef CONFIG_X86_32 + FIX_WP_TEST, +#endif +#ifdef CONFIG_INTEL_TXT + FIX_TBOOT_BASE, +#endif + __end_of_fixed_addresses +}; + + +extern void reserve_top_address(unsigned long reserve); + +#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) +#define FIXADDR_BOOT_START (FIXADDR_TOP - FIXADDR_BOOT_SIZE) + +extern int fixmaps_set; + +extern pte_t *kmap_pte; +extern pgprot_t kmap_prot; +extern pte_t *pkmap_page_table; + +void xen_set_fixmap(enum fixed_addresses, phys_addr_t, pgprot_t); + +static inline void __set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t flags) +{ + xen_set_fixmap(idx, phys, flags); +} + +#define set_fixmap(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL) + +/* + * Some hardware wants to get fixmapped without caching. + */ +#define set_fixmap_nocache(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) + +#define clear_fixmap(idx) \ + __set_fixmap(idx, 0, __pgprot(0)) + +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + +extern void __this_fixmap_does_not_exist(void); + +/* + * 'index to address' translation. If anyone tries to use the idx + * directly without translation, we catch the bug with a NULL-deference + * kernel oops. Illegal ranges of incoming indices are caught too. + */ +static __always_inline unsigned long fix_to_virt(const unsigned int idx) +{ + /* + * this branch gets completely eliminated after inlining, + * except when someone tries to use fixaddr indices in an + * illegal way. (such as mixing up address types or using + * out-of-range indices). + * + * If it doesn't get removed, the linker will complain + * loudly with a reasonably clear error message.. + */ + if (idx >= __end_of_fixed_addresses) + __this_fixmap_does_not_exist(); + + return __fix_to_virt(idx); +} + +static inline unsigned long virt_to_fix(const unsigned long vaddr) +{ + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + return __virt_to_fix(vaddr); +} + +/* Return an pointer with offset calculated */ +static __always_inline unsigned long +__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) +{ + __set_fixmap(idx, phys, flags); + return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1)); +} + +#define set_fixmap_offset(idx, phys) \ + __set_fixmap_offset(idx, phys, PAGE_KERNEL) + +#define set_fixmap_offset_nocache(idx, phys) \ + __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE) + +#endif /* !__ASSEMBLY__ */ +#endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/mach-xen/asm/gnttab_dma.h b/arch/x86/include/mach-xen/asm/gnttab_dma.h new file mode 100644 index 0000000..fd7197c --- /dev/null +++ b/arch/x86/include/mach-xen/asm/gnttab_dma.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2007 Herbert Xu + * Copyright (c) 2007 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _ASM_I386_GNTTAB_DMA_H +#define _ASM_I386_GNTTAB_DMA_H + +static inline int gnttab_dma_local_pfn(struct page *page) +{ + /* Has it become a local MFN? */ + return pfn_valid(mfn_to_local_pfn(pfn_to_mfn(page_to_pfn(page)))); +} + +static inline maddr_t gnttab_dma_map_page(struct page *page) +{ + __gnttab_dma_map_page(page); + return ((maddr_t)pfn_to_mfn(page_to_pfn(page)) << PAGE_SHIFT); +} + +static inline void gnttab_dma_unmap_page(maddr_t maddr) +{ + __gnttab_dma_unmap_page(virt_to_page(bus_to_virt(maddr))); +} + +#endif /* _ASM_I386_GNTTAB_DMA_H */ diff --git a/arch/x86/include/mach-xen/asm/highmem.h b/arch/x86/include/mach-xen/asm/highmem.h new file mode 100644 index 0000000..1243d04 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/highmem.h @@ -0,0 +1,98 @@ +/* + * highmem.h: virtual kernel memory mappings for high memory + * + * Used in CONFIG_HIGHMEM systems for memory pages which + * are not addressable by direct kernel virtual addresses. + * + * Copyright (C) 1999 Gerhard Wichert, Siemens AG + * Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * up to 16 Terabyte physical memory. With current x86 CPUs + * we now support up to 64 Gigabytes physical RAM. + * + * Copyright (C) 1999 Ingo Molnar + */ + +#ifndef _ASM_X86_HIGHMEM_H +#define _ASM_X86_HIGHMEM_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +/* declarations for highmem.c */ +extern unsigned long highstart_pfn, highend_pfn; + +/* + * Right now we initialize only a single pte table. It can be extended + * easily, subsequent pte tables have to be allocated in one physical + * chunk of RAM. + */ +/* + * Ordering is: + * + * FIXADDR_TOP + * fixed_addresses + * FIXADDR_START + * temp fixed addresses + * FIXADDR_BOOT_START + * Persistent kmap area + * PKMAP_BASE + * VMALLOC_END + * Vmalloc area + * VMALLOC_START + * high_memory + */ +#define LAST_PKMAP_MASK (LAST_PKMAP-1) +#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) +#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) + +extern void *kmap_high(struct page *page); +extern void kunmap_high(struct page *page); + +void *kmap(struct page *page); +void kunmap(struct page *page); + +void *kmap_atomic_prot(struct page *page, pgprot_t prot); +void *__kmap_atomic(struct page *page); +void __kunmap_atomic(void *kvaddr); +void *kmap_atomic_pfn(unsigned long pfn); +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); +struct page *kmap_atomic_to_page(void *ptr); + +#define kmap_atomic_pte(page) \ + kmap_atomic_prot(page, \ + PagePinned(page) ? 
PAGE_KERNEL_RO : kmap_prot) + +#define flush_cache_kmaps() do { } while (0) + +extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn); + +void clear_highpage(struct page *); +static inline void clear_user_highpage(struct page *page, unsigned long vaddr) +{ + clear_highpage(page); +} +#define __HAVE_ARCH_CLEAR_HIGHPAGE +#define clear_user_highpage clear_user_highpage +#define __HAVE_ARCH_CLEAR_USER_HIGHPAGE + +void copy_highpage(struct page *to, struct page *from); +static inline void copy_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + copy_highpage(to, from); +} +#define __HAVE_ARCH_COPY_HIGHPAGE +#define __HAVE_ARCH_COPY_USER_HIGHPAGE + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_HIGHMEM_H */ diff --git a/arch/x86/include/mach-xen/asm/hypercall.h b/arch/x86/include/mach-xen/asm/hypercall.h new file mode 100644 index 0000000..573ce8d --- /dev/null +++ b/arch/x86/include/mach-xen/asm/hypercall.h @@ -0,0 +1,439 @@ +/****************************************************************************** + * hypercall.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002-2004, K A Fraser + * + * 64-bit updates: + * Benjamin Liu + * Jun Nakajima + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef __HYPERCALL_H__ +#define __HYPERCALL_H__ + +#ifndef __HYPERVISOR_H__ +# error "please don't include this file directly" +#endif + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +# include +# include +#endif +#if CONFIG_XEN_COMPAT <= 0x030002 +# include /* memcpy() */ +# include +# include +#endif + +#ifdef CONFIG_XEN +#define HYPERCALL_ASM_OPERAND "%c" +#define HYPERCALL_LOCATION(op) (hypercall_page + (op) * 32) +#define HYPERCALL_C_OPERAND(name) "i" (HYPERCALL_LOCATION(__HYPERVISOR_##name)) +#else +#define HYPERCALL_ASM_OPERAND "*%" +#define HYPERCALL_LOCATION(op) (hypercall_stubs + (op) * 32) +#define HYPERCALL_C_OPERAND(name) "g" (HYPERCALL_LOCATION(__HYPERVISOR_##name)) +#endif + +#define HYPERCALL_ARG(arg, n) \ + register typeof((arg)+0) __arg##n asm(HYPERCALL_arg##n) = (arg) + +#define _hypercall0(type, name) \ +({ \ + type __res; \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "1" \ + : "=a" (__res) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall1(type, name, arg) \ +({ \ + type __res; \ + HYPERCALL_ARG(arg, 1); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "2" \ + : "=a" (__res), "+r" (__arg1) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall2(type, name, a1, a2) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "3" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall3(type, name, a1, a2, a3) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "4" \ + : "=a" (__res), "+r" (__arg1), \ + "+r" (__arg2), "+r" (__arg3) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall4(type, name, a1, a2, a3, a4) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + HYPERCALL_ARG(a4, 4); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "5" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + HYPERCALL_ARG(a4, 4); \ + HYPERCALL_ARG(a5, 5); \ + asm volatile ( \ + "call " HYPERCALL_ASM_OPERAND "6" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4), "+r" (__arg5) \ + : HYPERCALL_C_OPERAND(name) \ + : "memory" ); \ + __res; \ +}) + +#define _hypercall(type, op, a1, a2, a3, a4, a5) \ +({ \ + type __res; \ + HYPERCALL_ARG(a1, 1); \ + HYPERCALL_ARG(a2, 2); \ + HYPERCALL_ARG(a3, 3); \ + HYPERCALL_ARG(a4, 4); \ + HYPERCALL_ARG(a5, 5); \ + asm volatile ( \ + "call *%6" \ + : "=a" (__res), "+r" (__arg1), "+r" (__arg2), \ + "+r" (__arg3), "+r" (__arg4), "+r" (__arg5) \ + : "g" (HYPERCALL_LOCATION(op)) \ + : "memory" ); \ + __res; \ +}) + +#ifdef CONFIG_X86_32 +# include "hypercall_32.h" +#else +# include "hypercall_64.h" +#endif + +static inline int __must_check +HYPERVISOR_set_trap_table( + const trap_info_t *table) +{ + return _hypercall1(int, set_trap_table, table); +} + +static inline int __must_check +HYPERVISOR_mmu_update( + mmu_update_t *req, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + if (arch_use_lazy_mmu_mode()) + return xen_multi_mmu_update(req, count, 
success_count, domid); + return _hypercall4(int, mmu_update, req, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_mmuext_op( + struct mmuext_op *op, unsigned int count, unsigned int *success_count, + domid_t domid) +{ + if (arch_use_lazy_mmu_mode()) + return xen_multi_mmuext_op(op, count, success_count, domid); + return _hypercall4(int, mmuext_op, op, count, success_count, domid); +} + +static inline int __must_check +HYPERVISOR_set_gdt( + unsigned long *frame_list, unsigned int entries) +{ + return _hypercall2(int, set_gdt, frame_list, entries); +} + +static inline int __must_check +HYPERVISOR_stack_switch( + unsigned long ss, unsigned long esp) +{ + return _hypercall2(int, stack_switch, ss, esp); +} + +static inline int +HYPERVISOR_fpu_taskswitch( + int set) +{ + return _hypercall1(int, fpu_taskswitch, set); +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int __must_check +HYPERVISOR_sched_op_compat( + int cmd, unsigned long arg) +{ + return _hypercall2(int, sched_op_compat, cmd, arg); +} +#endif + +static inline int __must_check +HYPERVISOR_sched_op( + int cmd, void *arg) +{ + return _hypercall2(int, sched_op, cmd, arg); +} + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +static inline int __must_check +HYPERVISOR_platform_op( + struct xen_platform_op *platform_op) +{ + platform_op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, platform_op, platform_op); +} + +static inline int __must_check +HYPERVISOR_mca( + struct xen_mc *mc_op) +{ + mc_op->interface_version = XEN_MCA_INTERFACE_VERSION; + return _hypercall1(int, mca, mc_op); +} +#endif + +static inline int __must_check +HYPERVISOR_set_debugreg( + unsigned int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); +} + +static inline unsigned long __must_check +HYPERVISOR_get_debugreg( + unsigned int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); +} + +static inline int __must_check +HYPERVISOR_memory_op( + unsigned int cmd, void *arg) +{ + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(); + return _hypercall2(int, memory_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_multicall( + multicall_entry_t *call_list, unsigned int nr_calls) +{ + return _hypercall2(int, multicall, call_list, nr_calls); +} + +static inline int __must_check +HYPERVISOR_event_channel_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, event_channel_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + struct evtchn_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_xen_version( + int cmd, void *arg) +{ + return _hypercall2(int, xen_version, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_console_io( + int cmd, unsigned int count, char *str) +{ + return _hypercall3(int, console_io, cmd, count, str); +} + +static inline int __must_check +HYPERVISOR_physdev_op( + int cmd, void *arg) +{ + int rc = _hypercall2(int, physdev_op, cmd, arg); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (unlikely(rc == -ENOSYS)) { + struct physdev_op op; + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + memcpy(arg, &op.u, sizeof(op.u)); + } +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_grant_table_op( + unsigned int cmd, void *uop, unsigned int count) +{ + bool fixup = false; + 
int rc; + + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(); +#ifdef GNTTABOP_map_grant_ref + if (cmd == GNTTABOP_map_grant_ref) +#endif + fixup = gnttab_pre_map_adjust(cmd, uop, count); + rc = _hypercall3(int, grant_table_op, cmd, uop, count); + if (rc == 0 && fixup) + rc = gnttab_post_map_adjust(uop, count); + return rc; +} + +static inline int __must_check +HYPERVISOR_vm_assist( + unsigned int cmd, unsigned int type) +{ + return _hypercall2(int, vm_assist, cmd, type); +} + +static inline int __must_check +HYPERVISOR_vcpu_op( + int cmd, unsigned int vcpuid, void *extra_args) +{ + return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); +} + +static inline int __must_check +HYPERVISOR_suspend( + unsigned long srec) +{ + struct sched_shutdown sched_shutdown = { + .reason = SHUTDOWN_suspend + }; + + int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, + &sched_shutdown, srec); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, + SHUTDOWN_suspend, srec); +#endif + + return rc; +} + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int +HYPERVISOR_nmi_op( + unsigned long op, void *arg) +{ + return _hypercall2(int, nmi_op, op, arg); +} +#endif + +#ifndef CONFIG_XEN +static inline unsigned long __must_check +HYPERVISOR_hvm_op( + int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} +#endif + +static inline int __must_check +HYPERVISOR_callback_op( + int cmd, const void *arg) +{ + return _hypercall2(int, callback_op, cmd, arg); +} + +static inline int __must_check +HYPERVISOR_xenoprof_op( + int op, void *arg) +{ + return _hypercall2(int, xenoprof_op, op, arg); +} + +static inline int __must_check +HYPERVISOR_kexec_op( + unsigned long op, void *args) +{ + return _hypercall2(int, kexec_op, op, args); +} + +struct tmem_op; + +static inline int __must_check +HYPERVISOR_tmem_op( + struct tmem_op *op) +{ + return _hypercall1(int, tmem_op, (void *)op); +} + +#endif /* __HYPERCALL_H__ */ diff --git a/arch/x86/include/mach-xen/asm/hypercall_32.h b/arch/x86/include/mach-xen/asm/hypercall_32.h new file mode 100644 index 0000000..3987b2e --- /dev/null +++ b/arch/x86/include/mach-xen/asm/hypercall_32.h @@ -0,0 +1,62 @@ +#define HYPERCALL_arg1 "ebx" +#define HYPERCALL_arg2 "ecx" +#define HYPERCALL_arg3 "edx" +#define HYPERCALL_arg4 "esi" +#define HYPERCALL_arg5 "edi" + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int __must_check +HYPERVISOR_set_callbacks( + unsigned long event_selector, unsigned long event_address, + unsigned long failsafe_selector, unsigned long failsafe_address) +{ + return _hypercall4(int, set_callbacks, + event_selector, event_address, + failsafe_selector, failsafe_address); +} +#endif + +static inline long __must_check +HYPERVISOR_set_timer_op( + u64 timeout) +{ + return _hypercall2(long, set_timer_op, + (unsigned long)timeout, + (unsigned long)(timeout>>32)); +} + +static inline int __must_check +HYPERVISOR_update_descriptor( + u64 ma, u64 desc) +{ + return _hypercall4(int, update_descriptor, + (unsigned long)ma, (unsigned long)(ma>>32), + (unsigned long)desc, (unsigned long)(desc>>32)); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping( + unsigned long va, pte_t new_val, unsigned long flags) +{ + unsigned long pte_hi = 0; + + if (arch_use_lazy_mmu_mode()) + return xen_multi_update_va_mapping(va, new_val, flags); +#ifdef CONFIG_X86_PAE + pte_hi = new_val.pte_high; +#endif + return _hypercall4(int, update_va_mapping, va, + new_val.pte_low, pte_hi, flags); +} + 
+static inline int __must_check +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + unsigned long pte_hi = 0; +#ifdef CONFIG_X86_PAE + pte_hi = new_val.pte_high; +#endif + return _hypercall5(int, update_va_mapping_otherdomain, va, + new_val.pte_low, pte_hi, flags, domid); +} diff --git a/arch/x86/include/mach-xen/asm/hypercall_64.h b/arch/x86/include/mach-xen/asm/hypercall_64.h new file mode 100644 index 0000000..97d9445 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/hypercall_64.h @@ -0,0 +1,54 @@ +#define HYPERCALL_arg1 "rdi" +#define HYPERCALL_arg2 "rsi" +#define HYPERCALL_arg3 "rdx" +#define HYPERCALL_arg4 "r10" +#define HYPERCALL_arg5 "r8" + +#if CONFIG_XEN_COMPAT <= 0x030002 +static inline int __must_check +HYPERVISOR_set_callbacks( + unsigned long event_address, unsigned long failsafe_address, + unsigned long syscall_address) +{ + return _hypercall3(int, set_callbacks, + event_address, failsafe_address, syscall_address); +} +#endif + +static inline long __must_check +HYPERVISOR_set_timer_op( + u64 timeout) +{ + return _hypercall1(long, set_timer_op, timeout); +} + +static inline int __must_check +HYPERVISOR_update_descriptor( + unsigned long ma, unsigned long word) +{ + return _hypercall2(int, update_descriptor, ma, word); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping( + unsigned long va, pte_t new_val, unsigned long flags) +{ + if (arch_use_lazy_mmu_mode()) + return xen_multi_update_va_mapping(va, new_val, flags); + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags); +} + +static inline int __must_check +HYPERVISOR_update_va_mapping_otherdomain( + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + return _hypercall4(int, update_va_mapping_otherdomain, va, + new_val.pte, flags, domid); +} + +static inline int __must_check +HYPERVISOR_set_segment_base( + int reg, unsigned long value) +{ + return _hypercall2(int, set_segment_base, reg, value); +} diff --git a/arch/x86/include/mach-xen/asm/hypervisor.h b/arch/x86/include/mach-xen/asm/hypervisor.h new file mode 100644 index 0000000..b15cced --- /dev/null +++ b/arch/x86/include/mach-xen/asm/hypervisor.h @@ -0,0 +1,392 @@ +/****************************************************************************** + * hypervisor.h + * + * Linux-specific hypervisor handling. + * + * Copyright (c) 2002-2004, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __HYPERVISOR_H__ +#define __HYPERVISOR_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern shared_info_t *HYPERVISOR_shared_info; + +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT +DECLARE_PER_CPU(struct vcpu_info, vcpu_info); +#define vcpu_info(cpu) (&per_cpu(vcpu_info, cpu)) +#define current_vcpu_info() (&__get_cpu_var(vcpu_info)) +#define vcpu_info_read(fld) percpu_read(vcpu_info.fld) +#define vcpu_info_write(fld, val) percpu_write(vcpu_info.fld, val) +#define vcpu_info_xchg(fld, val) percpu_xchg(vcpu_info.fld, val) +void setup_vcpu_info(unsigned int cpu); +void adjust_boot_vcpu_info(void); +#else +#define vcpu_info(cpu) (HYPERVISOR_shared_info->vcpu_info + (cpu)) +#ifdef CONFIG_SMP +#define current_vcpu_info() vcpu_info(smp_processor_id()) +#else +#define current_vcpu_info() vcpu_info(0) +#endif +#define vcpu_info_read(fld) (current_vcpu_info()->fld) +#define vcpu_info_write(fld, val) (current_vcpu_info()->fld = (val)) +static inline void setup_vcpu_info(unsigned int cpu) {} +#endif + +#ifdef CONFIG_X86_32 +extern unsigned long hypervisor_virt_start; +#endif + +/* arch/xen/i386/kernel/setup.c */ +extern start_info_t *xen_start_info; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#define is_initial_xendomain() (xen_start_info->flags & SIF_INITDOMAIN) +#else +#define is_initial_xendomain() 0 +#endif + +#define init_hypervisor(c) ((void)(c)) +#define init_hypervisor_platform() init_hypervisor(&boot_cpu_data) + +DECLARE_PER_CPU(struct vcpu_runstate_info, runstate); +#define vcpu_running(cpu) (per_cpu(runstate.state, cpu) == RUNSTATE_running) + +/* arch/xen/kernel/evtchn.c */ +/* Force a proper event-channel callback from Xen. */ +void force_evtchn_callback(void); + +/* arch/xen/kernel/process.c */ +void xen_cpu_idle (void); + +/* arch/xen/i386/kernel/hypervisor.c */ +void do_hypervisor_callback(struct pt_regs *regs); + +/* arch/xen/i386/mm/hypervisor.c */ +/* + * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should be already + * be MACHINE addresses. + */ + +void xen_pt_switch(pgd_t *); +void xen_new_user_pt(pgd_t *); /* x86_64 only */ +void xen_load_gs(unsigned int selector); /* x86_64 only */ +void xen_tlb_flush(void); +void xen_invlpg(unsigned long ptr); + +void xen_l1_entry_update(pte_t *ptr, pte_t val); +void xen_l2_entry_update(pmd_t *ptr, pmd_t val); +void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */ +void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */ +void xen_pgd_pin(pgd_t *); +void xen_pgd_unpin(pgd_t *); + +void xen_init_pgd_pin(void); +#ifdef CONFIG_PM_SLEEP +void setup_pfn_to_mfn_frame_list(void *(*)(unsigned long, unsigned long, + unsigned long)); +#endif + +void xen_set_ldt(const void *ptr, unsigned int ents); + +#ifdef CONFIG_SMP +#include +void xen_tlb_flush_all(void); +void xen_invlpg_all(unsigned long ptr); +void xen_tlb_flush_mask(const cpumask_t *mask); +void xen_invlpg_mask(const cpumask_t *mask, unsigned long ptr); +#else +#define xen_tlb_flush_all xen_tlb_flush +#define xen_invlpg_all xen_invlpg +#endif + +/* Returns zero on success else negative errno. 
*/ +int xen_create_contiguous_region( + unsigned long vstart, unsigned int order, unsigned int address_bits); +void xen_destroy_contiguous_region( + unsigned long vstart, unsigned int order); +int early_create_contiguous_region(unsigned long pfn, unsigned int order, + unsigned int address_bits); + +struct page; + +int xen_limit_pages_to_max_mfn( + struct page *pages, unsigned int order, unsigned int address_bits); + +bool __cold hypervisor_oom(void); + +/* Turn jiffies into Xen system time. */ +u64 jiffies_to_st(unsigned long jiffies); + +#ifdef CONFIG_XEN_SCRUB_PAGES +void scrub_pages(void *, unsigned int); +#else +#define scrub_pages(_p,_n) ((void)0) +#endif + +#if defined(CONFIG_XEN) && !defined(MODULE) + +DECLARE_PER_CPU(bool, xen_lazy_mmu); + +void xen_multicall_flush(void); + +int __must_check xen_multi_update_va_mapping(unsigned long va, pte_t, + unsigned long flags); +int __must_check xen_multi_mmu_update(mmu_update_t *, unsigned int count, + unsigned int *success_count, domid_t); +int __must_check xen_multi_mmuext_op(struct mmuext_op *, unsigned int count, + unsigned int *success_count, domid_t); + +#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE +static inline void arch_enter_lazy_mmu_mode(void) +{ + percpu_write(xen_lazy_mmu, true); +} + +static inline void arch_leave_lazy_mmu_mode(void) +{ + percpu_write(xen_lazy_mmu, false); + xen_multicall_flush(); +} + +#define arch_use_lazy_mmu_mode() unlikely(percpu_read(xen_lazy_mmu)) + +#if 0 /* All uses are in places potentially called asynchronously, but + * asynchronous code should rather not make use of lazy mode at all. + * Therefore, all uses of this function get commented out, proper + * detection of asynchronous invocations is added whereever needed, + * and this function is disabled to catch any new (improper) uses. + */ +static inline void arch_flush_lazy_mmu_mode(void) +{ + if (arch_use_lazy_mmu_mode()) + xen_multicall_flush(); +} +#endif + +#else /* !CONFIG_XEN || MODULE */ + +static inline void xen_multicall_flush(void) {} +#define arch_use_lazy_mmu_mode() false +#define xen_multi_update_va_mapping(...) ({ BUG(); -ENOSYS; }) +#define xen_multi_mmu_update(...) ({ BUG(); -ENOSYS; }) +#define xen_multi_mmuext_op(...) ({ BUG(); -ENOSYS; }) + +#endif /* CONFIG_XEN && !MODULE */ + +#ifdef CONFIG_XEN + +struct gnttab_map_grant_ref; +bool gnttab_pre_map_adjust(unsigned int cmd, struct gnttab_map_grant_ref *, + unsigned int count); +#if CONFIG_XEN_COMPAT < 0x030400 +int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *, unsigned int); +#else +static inline int gnttab_post_map_adjust(const struct gnttab_map_grant_ref *m, + unsigned int count) +{ + BUG(); + return -ENOSYS; +} +#endif + +#else /* !CONFIG_XEN */ + +#define gnttab_pre_map_adjust(...) false +#define gnttab_post_map_adjust(...) 
({ BUG(); -ENOSYS; }) + +#endif /* CONFIG_XEN */ + +#if defined(CONFIG_X86_64) +#define MULTI_UVMFLAGS_INDEX 2 +#define MULTI_UVMDOMID_INDEX 3 +#else +#define MULTI_UVMFLAGS_INDEX 3 +#define MULTI_UVMDOMID_INDEX 4 +#endif + +#ifdef CONFIG_XEN +#define is_running_on_xen() 1 +extern char hypercall_page[PAGE_SIZE]; +#else +extern char *hypercall_stubs; +#define is_running_on_xen() (!!hypercall_stubs) +#endif + +#include + +static inline int +HYPERVISOR_yield( + void) +{ + int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +static inline int +HYPERVISOR_block( + void) +{ + int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL); + +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0); +#endif + + return rc; +} + +static inline void __noreturn +HYPERVISOR_shutdown( + unsigned int reason) +{ + struct sched_shutdown sched_shutdown = { + .reason = reason + }; + + VOID(HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown)); +#if CONFIG_XEN_COMPAT <= 0x030002 + VOID(HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason)); +#endif + /* Don't recurse needlessly. */ + BUG_ON(reason != SHUTDOWN_crash); + for(;;); +} + +static inline int __must_check +HYPERVISOR_poll( + evtchn_port_t *ports, unsigned int nr_ports, u64 timeout) +{ + int rc; + struct sched_poll sched_poll = { + .nr_ports = nr_ports, + .timeout = jiffies_to_st(timeout) + }; + set_xen_guest_handle(sched_poll.ports, ports); + + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +static inline int __must_check +HYPERVISOR_poll_no_timeout( + evtchn_port_t *ports, unsigned int nr_ports) +{ + int rc; + struct sched_poll sched_poll = { + .nr_ports = nr_ports + }; + set_xen_guest_handle(sched_poll.ports, ports); + + rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (rc == -ENOSYS) + rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); +#endif + + return rc; +} + +#ifdef CONFIG_XEN + +static inline void +MULTI_update_va_mapping( + multicall_entry_t *mcl, unsigned long va, + pte_t new_val, unsigned long flags) +{ + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = va; +#if defined(CONFIG_X86_64) + mcl->args[1] = new_val.pte; +#elif defined(CONFIG_X86_PAE) + mcl->args[1] = new_val.pte_low; + mcl->args[2] = new_val.pte_high; +#else + mcl->args[1] = new_val.pte_low; + mcl->args[2] = 0; +#endif + mcl->args[MULTI_UVMFLAGS_INDEX] = flags; +} + +static inline void +MULTI_mmu_update(multicall_entry_t *mcl, mmu_update_t *req, + unsigned int count, unsigned int *success_count, + domid_t domid) +{ + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)req; + mcl->args[1] = count; + mcl->args[2] = (unsigned long)success_count; + mcl->args[3] = domid; +} + +static inline void +MULTI_grant_table_op(multicall_entry_t *mcl, unsigned int cmd, + void *uop, unsigned int count) +{ + mcl->op = __HYPERVISOR_grant_table_op; + mcl->args[0] = cmd; + mcl->args[1] = (unsigned long)uop; + mcl->args[2] = count; +} + +#else /* !defined(CONFIG_XEN) */ + +/* Multicalls not supported for HVM guests. 
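 *
 * For the CONFIG_XEN build above, the MULTI_* helpers only fill in a
 * multicall_entry_t; the batch still has to be submitted in one go.
 * A rough sketch, assuming the HYPERVISOR_multicall() wrapper from
 * hypercall.h and caller-supplied va0/va1/pte0/pte1 (all of which are
 * illustrative):
 *
 *	multicall_entry_t mcl[2];
 *
 *	MULTI_update_va_mapping(&mcl[0], va0, pte0, 0);
 *	MULTI_update_va_mapping(&mcl[1], va1, pte1, UVMF_INVLPG|UVMF_LOCAL);
 *	if (HYPERVISOR_multicall(mcl, 2))
 *		BUG();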
*/ +#define MULTI_update_va_mapping(a,b,c,d) ((void)0) +#define MULTI_grant_table_op(a,b,c,d) ((void)0) + +#endif + +#define uvm_multi(cpumask) ((unsigned long)cpumask_bits(cpumask) | UVMF_MULTI) + +#ifdef LINUX +/* drivers/staging/ use Windows-style types, including VOID */ +#undef VOID +#endif + +#endif /* __HYPERVISOR_H__ */ diff --git a/arch/x86/include/mach-xen/asm/i387.h b/arch/x86/include/mach-xen/asm/i387.h new file mode 100644 index 0000000..4922b04 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/i387.h @@ -0,0 +1,55 @@ +#ifndef _ASM_X86_I387_H +#define switch_fpu_prepare native_switch_fpu_prepare +#include_next + +#ifndef __ASSEMBLY__ +static inline void xen_thread_fpu_begin(struct task_struct *tsk, + multicall_entry_t *mcl) +{ + if (mcl) { + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 0; + } + __thread_set_has_fpu(tsk); +} + +static inline fpu_switch_t xen_switch_fpu_prepare(struct task_struct *old, + struct task_struct *new, + int cpu, + multicall_entry_t **mcl) +{ + fpu_switch_t fpu; + + fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; + if (__thread_has_fpu(old)) { + if (!__save_init_fpu(old)) + cpu = ~0; + old->thread.fpu.last_cpu = cpu; + old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ + + /* Don't change CR0.TS if we just switch! */ + if (fpu.preload) { + new->fpu_counter++; + __thread_set_has_fpu(new); + prefetch(new->thread.fpu.state); + } else { + (*mcl)->op = __HYPERVISOR_fpu_taskswitch; + (*mcl)++->args[0] = 1; + } + } else { + old->fpu_counter = 0; + old->thread.fpu.last_cpu = ~0; + if (fpu.preload) { + new->fpu_counter++; + if (fpu_lazy_restore(new, cpu)) + fpu.preload = 0; + else + prefetch(new->thread.fpu.state); + xen_thread_fpu_begin(new, (*mcl)++); + } + } + return fpu; +} +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_I387_H */ diff --git a/arch/x86/include/mach-xen/asm/io.h b/arch/x86/include/mach-xen/asm/io.h new file mode 100644 index 0000000..2d07f8a --- /dev/null +++ b/arch/x86/include/mach-xen/asm/io.h @@ -0,0 +1,343 @@ +#ifndef _ASM_X86_IO_H +#define _ASM_X86_IO_H + +/* + * This file contains the definitions for the x86 IO instructions + * inb/inw/inl/outb/outw/outl and the "string versions" of the same + * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" + * versions of the single-IO instructions (inb_p/inw_p/..). + * + * This file is not meant to be obfuscating: it's just complicated + * to (a) handle it all in a way that makes gcc able to optimize it + * as well as possible and (b) trying to avoid writing the same thing + * over and over again with slight variations and possibly making a + * mistake somewhere. + */ + +/* + * Thanks to James van Artsdalen for a better timing-fix than + * the two short jumps: using outb's to a nonexistent port seems + * to guarantee better timings even on fast machines. + * + * On the other hand, I'd like to be sure of a non-existent port: + * I feel a bit unsafe about using 0x80 (should be safe, though) + * + * Linus + */ + + /* + * Bit simplified and optimized by Jan Hubicka + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. 
+ * + * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, + * isa_read[wl] and isa_write[wl] fixed + * - Arnaldo Carvalho de Melo + */ + +#define ARCH_HAS_IOREMAP_WC + +#include +#include +#include +#ifdef __KERNEL__ +#include +#endif + +#define build_mmio_read(name, size, type, reg, barrier) \ +static inline type name(const volatile void __iomem *addr) \ +{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \ +:"m" (*(volatile type __force *)addr) barrier); return ret; } + +#define build_mmio_write(name, size, type, reg, barrier) \ +static inline void name(type val, volatile void __iomem *addr) \ +{ asm volatile("mov" size " %0,%1": :reg (val), \ +"m" (*(volatile type __force *)addr) barrier); } + +build_mmio_read(readb, "b", unsigned char, "=q", :"memory") +build_mmio_read(readw, "w", unsigned short, "=r", :"memory") +build_mmio_read(readl, "l", unsigned int, "=r", :"memory") + +build_mmio_read(__readb, "b", unsigned char, "=q", ) +build_mmio_read(__readw, "w", unsigned short, "=r", ) +build_mmio_read(__readl, "l", unsigned int, "=r", ) + +build_mmio_write(writeb, "b", unsigned char, "q", :"memory") +build_mmio_write(writew, "w", unsigned short, "r", :"memory") +build_mmio_write(writel, "l", unsigned int, "r", :"memory") + +build_mmio_write(__writeb, "b", unsigned char, "q", ) +build_mmio_write(__writew, "w", unsigned short, "r", ) +build_mmio_write(__writel, "l", unsigned int, "r", ) + +#define readb_relaxed(a) __readb(a) +#define readw_relaxed(a) __readw(a) +#define readl_relaxed(a) __readl(a) +#define __raw_readb __readb +#define __raw_readw __readw +#define __raw_readl __readl + +#define __raw_writeb __writeb +#define __raw_writew __writew +#define __raw_writel __writel + +#define mmiowb() barrier() + +#ifdef CONFIG_X86_64 + +build_mmio_read(readq, "q", unsigned long, "=r", :"memory") +build_mmio_write(writeq, "q", unsigned long, "r", :"memory") + +#define readq_relaxed(a) readq(a) + +#define __raw_readq(a) readq(a) +#define __raw_writeq(val, addr) writeq(val, addr) + +/* Let people know that we have them */ +#define readq readq +#define writeq writeq + +#endif + +/** + * virt_to_phys - map virtual addresses to physical + * @address: address to remap + * + * The returned physical address is the physical (CPU) mapping for + * the memory address given. It is only valid to use this function on + * addresses directly mapped or allocated via kmalloc. + * + * This function does not give bus mappings for DMA transfers. In + * almost all conceivable cases a device driver should not be using + * this function + */ + +static inline phys_addr_t virt_to_phys(volatile void *address) +{ + return __pa(address); +} + +/** + * phys_to_virt - map physical address to virtual + * @address: address to remap + * + * The returned virtual address is a current CPU mapping for + * the memory address given. It is only valid to use this function on + * addresses that have a kernel mapping + * + * This function does not handle bus mappings for DMA transfers. In + * almost all conceivable cases a device driver should not be using + * this function + */ + +static inline void *phys_to_virt(phys_addr_t address) +{ + return __va(address); +} + +/* + * Change "struct page" to physical address. 
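 *
 * Under Xen the two notions differ: page_to_pseudophys() yields the
 * guest-physical (pseudophysical) address, while page_to_phys() is
 * redefined below to return the machine address a device would see.
 * A small sketch (the page allocation and the missing NULL check are
 * illustrative):
 *
 *	struct page *pg = alloc_page(GFP_KERNEL);
 *	dma_addr_t gpa = page_to_pseudophys(pg);	// guest-physical
 *	dma_addr_t ma = page_to_phys(pg);		// machine address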
+ */ +#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) +#undef page_to_phys +#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page))) +#define page_to_bus(page) (phys_to_machine(page_to_pseudophys(page))) + +/* + * ISA I/O bus memory addresses are 1:1 with the physical address. + * However, we truncate the address to unsigned int to avoid undesirable + * promitions in legacy drivers. + */ +#define isa_virt_to_bus(_x) ({ \ + unsigned long _va_ = (unsigned long)(_x); \ + _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) < (NR_FIX_ISAMAPS << PAGE_SHIFT) \ + ? _va_ - fix_to_virt(FIX_ISAMAP_BEGIN) \ + : ({ BUG(); (unsigned long)virt_to_bus(_va_); }); }) +#define isa_bus_to_virt(_x) ((void *)fix_to_virt(FIX_ISAMAP_BEGIN) + (_x)) + +/* + * However PCI ones are not necessarily 1:1 and therefore these interfaces + * are forbidden in portable PCI drivers. + * + * Allow them on x86 for legacy drivers, though. + */ +#define virt_to_bus(_x) phys_to_machine(__pa(_x)) +#define bus_to_virt(_x) __va(machine_to_phys(_x)) + +/** + * ioremap - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * If the area you are trying to map is a PCI BAR you should have a + * look at pci_iomap(). + */ +extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, + unsigned long prot_val); + +/* + * The default ioremap() behavior is non-cached: + */ +static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) +{ + return ioremap_nocache(offset, size); +} + +extern void iounmap(volatile void __iomem *addr); + +extern void set_iounmap_nonlazy(void); + +#ifdef __KERNEL__ + +#include + +#include + +/* + * Convert a virtual cached pointer to an uncached pointer + */ +#define xlate_dev_kmem_ptr(p) p + +static inline void +memset_io(volatile void __iomem *addr, unsigned char val, size_t count) +{ + memset((void __force *)addr, val, count); +} + +static inline void +memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count) +{ + memcpy(dst, (const void __force *)src, count); +} + +static inline void +memcpy_toio(volatile void __iomem *dst, const void *src, size_t count) +{ + memcpy((void __force *)dst, src, count); +} + +/* + * Cache management + * + * This needed for two cases + * 1. Out of order aware processors + * 2. 
Accidentally out of order processors (PPro errata #51) + */ + +static inline void flush_write_buffers(void) +{ +#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) + asm volatile("lock; addl $0,0(%%esp)": : :"memory"); +#endif +} + +#endif /* __KERNEL__ */ + +extern void native_io_delay(void); + +extern int io_delay_type; +extern void io_delay_init(void); + +static inline void slow_down_io(void) +{ + native_io_delay(); +#ifdef REALLY_SLOW_IO + native_io_delay(); + native_io_delay(); + native_io_delay(); +#endif +} + +#define BUILDIO(bwl, bw, type) \ +static inline void out##bwl(unsigned type value, int port) \ +{ \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ + : : "a"(value), "Nd"(port)); \ +} \ + \ +static inline unsigned type in##bwl(int port) \ +{ \ + unsigned type value; \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ + : "=a"(value) : "Nd"(port)); \ + return value; \ +} \ + \ +static inline void out##bwl##_p(unsigned type value, int port) \ +{ \ + out##bwl(value, port); \ + slow_down_io(); \ +} \ + \ +static inline unsigned type in##bwl##_p(int port) \ +{ \ + unsigned type value = in##bwl(port); \ + slow_down_io(); \ + return value; \ +} \ + \ +static inline void outs##bwl(int port, const void *addr, unsigned long count) \ +{ \ + asm volatile("rep; outs" #bwl \ + : "+S"(addr), "+c"(count) : "d"(port)); \ +} \ + \ +static inline void ins##bwl(int port, void *addr, unsigned long count) \ +{ \ + asm volatile("rep; ins" #bwl \ + : "+D"(addr), "+c"(count) : "d"(port)); \ +} + +BUILDIO(b, b, char) +BUILDIO(w, w, short) +BUILDIO(l, , int) + +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) + +/* We will be supplying our own /dev/mem implementation */ +#define ARCH_HAS_DEV_MEM + +#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \ + (unsigned long)(bv)->bv_offset) + +#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ + (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ + && bvec_to_pseudophys(vec1) + (vec1)->bv_len \ + == bvec_to_pseudophys(vec2)) + +#endif + +extern void *xlate_dev_mem_ptr(unsigned long phys); +extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); + +extern int ioremap_check_change_attr(unsigned long mfn, unsigned long size, + unsigned long prot_val); +extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size); + +/* + * early_ioremap() and early_iounmap() are for temporary early boot-time + * mappings, before the real ioremap() is functional. + * A boot-time mapping is currently limited to at most 16 pages. 
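 *
 * Typical boot-time use maps, copies and unmaps within one scope; a
 * short sketch, where phys, len and buf are illustrative and come from
 * the caller:
 *
 *	void __iomem *p = early_ioremap(phys, len);
 *
 *	if (p) {
 *		memcpy_fromio(buf, p, len);
 *		early_iounmap(p, len);
 *	}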
+ */ +extern void early_ioremap_init(void); +extern void early_ioremap_reset(void); +extern void __iomem *early_ioremap(resource_size_t phys_addr, + unsigned long size); +extern void __iomem *early_memremap(resource_size_t phys_addr, + unsigned long size); +extern void __iomem *early_memremap_ro(resource_size_t phys_addr, + unsigned long size); +extern void early_iounmap(void __iomem *addr, unsigned long size); +extern void fixup_early_ioremap(void); +extern bool is_early_ioremap_ptep(pte_t *ptep); + +#define IO_SPACE_LIMIT 0xffff + +#endif /* _ASM_X86_IO_H */ diff --git a/arch/x86/include/mach-xen/asm/ipi.h b/arch/x86/include/mach-xen/asm/ipi.h new file mode 100644 index 0000000..4bdda1d --- /dev/null +++ b/arch/x86/include/mach-xen/asm/ipi.h @@ -0,0 +1,13 @@ +#ifndef _ASM_X86_IPI_H +#define _ASM_X86_IPI_H + +#include +#include + +void xen_send_IPI_mask(const struct cpumask *, int vector); +void xen_send_IPI_mask_allbutself(const struct cpumask *, int vector); +void xen_send_IPI_allbutself(int vector); +void xen_send_IPI_all(int vector); +void xen_send_IPI_self(int vector); + +#endif /* _ASM_X86_IPI_H */ diff --git a/arch/x86/include/mach-xen/asm/irq_vectors.h b/arch/x86/include/mach-xen/asm/irq_vectors.h new file mode 100644 index 0000000..7798731 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/irq_vectors.h @@ -0,0 +1,98 @@ +#ifndef _ASM_X86_IRQ_VECTORS_H +#define _ASM_X86_IRQ_VECTORS_H + +#define MCE_VECTOR 0x12 + +#define IA32_SYSCALL_VECTOR 0x80 +#ifdef CONFIG_X86_32 +# define SYSCALL_VECTOR 0x80 +#endif + +#define RESCHEDULE_VECTOR 0 +#define CALL_FUNCTION_VECTOR 1 +#define NMI_VECTOR 0x02 +#define CALL_FUNC_SINGLE_VECTOR 3 +#define REBOOT_VECTOR 4 +#ifdef CONFIG_IRQ_WORK +#define IRQ_WORK_VECTOR 5 +#define NR_IPIS 6 +#else +#define NR_IPIS 5 +#endif + +/* + * The maximum number of vectors supported by i386 processors + * is limited to 256. For processors other than i386, NR_VECTORS + * should be changed accordingly. + */ +#define NR_VECTORS 256 + +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 + +#ifndef __ASSEMBLY__ +static inline int invalid_vm86_irq(int irq) +{ + return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; +} +#endif + +/* + * Size the maximum number of interrupts. + * + * If the irq_desc[] array has a sparse layout, we can size things + * generously - it scales up linearly with the maximum number of CPUs, + * and the maximum number of IO-APICs, whichever is higher. + * + * In other cases we size more conservatively, to not create too large + * static arrays. + */ + +#define NR_IRQS_LEGACY 16 + +/* + * The flat IRQ space is divided into two regions: + * 1. A one-to-one mapping of real physical IRQs. This space is only used + * if we have physical device-access privilege. This region is at the + * start of the IRQ space so that existing device drivers do not need + * to be modified to translate physical IRQ numbers into our IRQ space. + * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These + * are bound using the provided bind/unbind functions. + */ +#define PIRQ_BASE 0 +/* PHYSDEVOP_pirq_eoi_gmfn restriction: */ +#define PIRQ_MAX(n) ((n) < (1 << (PAGE_SHIFT + 3)) - NR_VECTORS \ + ? (n) : (1 << (PAGE_SHIFT + 3)) - NR_VECTORS) + +#define IO_APIC_VECTOR_LIMIT PIRQ_MAX(32 * MAX_IO_APICS) +#define CPU_VECTOR_LIMIT PIRQ_MAX(64 * NR_CPUS) + +#if defined(CONFIG_X86_IO_APIC) +# define NR_PIRQS \ + (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? 
\ + (NR_VECTORS + CPU_VECTOR_LIMIT) : \ + (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) +#elif defined(CONFIG_XEN_PCIDEV_FRONTEND) +# define NR_PIRQS (NR_VECTORS + CPU_VECTOR_LIMIT) +#else /* !CONFIG_X86_IO_APIC: */ +# define NR_PIRQS NR_IRQS_LEGACY +#endif + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_SPARSE_IRQ +extern int nr_pirqs; +#else +# define nr_pirqs NR_PIRQS +#endif +#endif + +#define DYNIRQ_BASE (PIRQ_BASE + nr_pirqs) +#ifdef CONFIG_SPARSE_IRQ +#define NR_DYNIRQS (CPU_VECTOR_LIMIT + CONFIG_XEN_NR_GUEST_DEVICES) +#else +#define NR_DYNIRQS (64 + CONFIG_XEN_NR_GUEST_DEVICES) +#endif + +#define NR_IRQS (NR_PIRQS + NR_DYNIRQS) + +#endif /* _ASM_X86_IRQ_VECTORS_H */ diff --git a/arch/x86/include/mach-xen/asm/irqflags.h b/arch/x86/include/mach-xen/asm/irqflags.h new file mode 100644 index 0000000..95d336f --- /dev/null +++ b/arch/x86/include/mach-xen/asm/irqflags.h @@ -0,0 +1,212 @@ +#ifndef _X86_IRQFLAGS_H_ +#define _X86_IRQFLAGS_H_ + +#include + +#ifndef __ASSEMBLY__ +#include +#include +/* + * The use of 'barrier' in the following reflects their use as local-lock + * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following + * critical operations are executed. All critical operations must complete + * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also + * includes these barriers, for example. + */ + +#define xen_save_fl(void) vcpu_info_read(evtchn_upcall_mask) + +#define xen_restore_fl(f) \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \ + barrier(); /* unmask then check (avoid races) */\ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ + } \ +} while (0) + +#define xen_irq_disable() \ +do { \ + vcpu_info_write(evtchn_upcall_mask, 1); \ + barrier(); \ +} while (0) + +#define xen_irq_enable() \ +do { \ + vcpu_info_t *_vcpu; \ + barrier(); \ + _vcpu = current_vcpu_info(); \ + _vcpu->evtchn_upcall_mask = 0; \ + barrier(); /* unmask then check (avoid races) */ \ + if (unlikely(_vcpu->evtchn_upcall_pending)) \ + force_evtchn_callback(); \ +} while (0) + +#define arch_local_save_flags() xen_save_fl() + +#define arch_local_irq_restore(flags) xen_restore_fl(flags) + +#define arch_local_irq_disable() xen_irq_disable() + +#define arch_local_irq_enable() xen_irq_enable() + +/* + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +#define arch_safe_halt HYPERVISOR_block + +/* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +#define halt() VOID(irqs_disabled() \ + ? HYPERVISOR_vcpu_op(VCPUOP_down, \ + smp_processor_id(), NULL) \ + : 0) + +/* + * For spinlocks, etc: + */ +#define arch_local_irq_save() \ +({ \ + unsigned long flags = arch_local_save_flags(); \ + \ + arch_local_irq_disable(); \ + \ + flags; \ +}) +#else + +/* Offsets into shared_info_t. 
*/ +#define evtchn_upcall_pending /* 0 */ +#define evtchn_upcall_mask 1 + +#ifdef CONFIG_X86_64 +# define __REG_si %rsi +# define __CPU_num PER_CPU_VAR(cpu_number) +#else +# define __REG_si %esi +# define __CPU_num TI_cpu(%ebp) +#endif + +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT + +#define GET_VCPU_INFO PER_CPU(vcpu_info, __REG_si) +#define __DISABLE_INTERRUPTS movb $1,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask) +#define __ENABLE_INTERRUPTS movb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_mask) +#define __TEST_PENDING cmpb $0,PER_CPU_VAR(vcpu_info+evtchn_upcall_pending+0) +#define DISABLE_INTERRUPTS(clb) __DISABLE_INTERRUPTS +#define ENABLE_INTERRUPTS(clb) __ENABLE_INTERRUPTS + +#define __SIZEOF_DISABLE_INTERRUPTS 8 +#define __SIZEOF_TEST_PENDING 8 + +#else /* CONFIG_XEN_VCPU_INFO_PLACEMENT */ + +#define sizeof_vcpu_shift 6 + +#ifdef CONFIG_SMP +#define GET_VCPU_INFO movl __CPU_num,%esi ; \ + shl $sizeof_vcpu_shift,%esi ; \ + add HYPERVISOR_shared_info,__REG_si +#else +#define GET_VCPU_INFO mov HYPERVISOR_shared_info,__REG_si +#endif + +#define __DISABLE_INTERRUPTS movb $1,evtchn_upcall_mask(__REG_si) +#define __ENABLE_INTERRUPTS movb $0,evtchn_upcall_mask(__REG_si) +#define __TEST_PENDING testb $0xFF,evtchn_upcall_pending(__REG_si) +#define DISABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ + __DISABLE_INTERRUPTS +#define ENABLE_INTERRUPTS(clb) GET_VCPU_INFO ; \ + __ENABLE_INTERRUPTS + +#define __SIZEOF_DISABLE_INTERRUPTS 4 +#define __SIZEOF_TEST_PENDING 3 + +#endif /* CONFIG_XEN_VCPU_INFO_PLACEMENT */ + +#ifndef CONFIG_X86_64 +#define INTERRUPT_RETURN iret +#define ENABLE_INTERRUPTS_SYSEXIT \ + movb $0,evtchn_upcall_mask(%esi) /* __ENABLE_INTERRUPTS */ ; \ +sysexit_scrit: /**** START OF SYSEXIT CRITICAL REGION ****/ ; \ + cmpb $0,evtchn_upcall_pending(%esi) /* __TEST_PENDING */ ; \ + jnz 14f /* process more events if necessary... 
*/ ; \ + movl PT_ESI(%esp), %esi ; \ + sysexit ; \ +14: movb $1,evtchn_upcall_mask(%esi) /* __DISABLE_INTERRUPTS */ ; \ + TRACE_IRQS_OFF ; \ +sysexit_ecrit: /**** END OF SYSEXIT CRITICAL REGION ****/ ; \ + mov $__KERNEL_PERCPU, %ecx ; \ + push %esp ; \ + mov %ecx, %fs ; \ + SET_KERNEL_GS %ecx ; \ + call evtchn_do_upcall ; \ + add $4,%esp ; \ + jmp ret_from_intr +#endif + + +#endif /* __ASSEMBLY__ */ + +#ifndef __ASSEMBLY__ +static inline int arch_irqs_disabled_flags(unsigned long flags) +{ + return (flags != 0); +} + +#define arch_irqs_disabled() \ +({ \ + unsigned long flags = arch_local_save_flags(); \ + \ + arch_irqs_disabled_flags(flags); \ +}) + +#else + +#ifdef CONFIG_X86_64 +#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk +#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ + TRACE_IRQS_ON; \ + ENABLE_INTERRUPTS(CLBR_NONE); \ + SAVE_REST; \ + LOCKDEP_SYS_EXIT; \ + RESTORE_REST; \ + __DISABLE_INTERRUPTS; \ + TRACE_IRQS_OFF; + +#else +#define ARCH_LOCKDEP_SYS_EXIT \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call lockdep_sys_exit; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_LOCKDEP_SYS_EXIT_IRQ +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; +#else +# define TRACE_IRQS_ON +# define TRACE_IRQS_OFF +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ +# else +# define LOCKDEP_SYS_EXIT +# define LOCKDEP_SYS_EXIT_IRQ +# endif + +#endif /* __ASSEMBLY__ */ +#endif diff --git a/arch/x86/include/mach-xen/asm/mach_traps.h b/arch/x86/include/mach-xen/asm/mach_traps.h new file mode 100644 index 0000000..99314d3 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/mach_traps.h @@ -0,0 +1,37 @@ +/* + * include/asm-xen/asm-i386/mach-xen/mach_traps.h + * + * Machine specific NMI handling for Xen + */ +#ifndef _MACH_TRAPS_H +#define _MACH_TRAPS_H + +#include +#include + +#define NMI_REASON_SERR 0x80 +#define NMI_REASON_IOCHK 0x40 +#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK) + +static inline void clear_serr_error(unsigned char reason) {} +static inline void clear_io_check_error(unsigned char reason) {} + +static inline unsigned char xen_get_nmi_reason(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + unsigned char reason = 0; + + /* construct a value which looks like it came from + * port 0x61. + */ + if (test_bit(_XEN_NMIREASON_io_error, &s->arch.nmi_reason)) + reason |= NMI_REASON_IOCHK; + if (test_bit(_XEN_NMIREASON_parity_error, &s->arch.nmi_reason)) + reason |= NMI_REASON_SERR; + + return reason; +} + +static inline void reassert_nmi(void) {} + +#endif /* !_MACH_TRAPS_H */ diff --git a/arch/x86/include/mach-xen/asm/maddr.h b/arch/x86/include/mach-xen/asm/maddr.h new file mode 100644 index 0000000..455e848 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/maddr.h @@ -0,0 +1,155 @@ +#ifndef _X86_MADDR_H +#define _X86_MADDR_H + +#include +#include +#include +#include + +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0UL) +#define FOREIGN_FRAME_BIT (1UL << (BITS_PER_LONG - 1)) +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + +/* Definitions for machine and pseudophysical addresses. 
*/ +#ifdef CONFIG_X86_PAE +typedef unsigned long long paddr_t; +typedef unsigned long long maddr_t; +#else +typedef unsigned long paddr_t; +typedef unsigned long maddr_t; +#endif + +#ifdef CONFIG_XEN + +extern unsigned long *phys_to_machine_mapping; +extern unsigned long max_mapnr; + +#undef machine_to_phys_mapping +extern unsigned long *machine_to_phys_mapping; +extern unsigned long machine_to_phys_nr; + +static inline unsigned long pfn_to_mfn(unsigned long pfn) +{ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return pfn; + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + return phys_to_machine_mapping[pfn] & ~FOREIGN_FRAME_BIT; +} + +static inline int phys_to_machine_mapping_valid(unsigned long pfn) +{ + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return 1; + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) + return mfn; + + if (unlikely(mfn >= machine_to_phys_nr)) + return max_mapnr; + + /* The array access can fail (e.g., device space beyond end of RAM). */ + asm ( + "1: "_ASM_MOV" %1,%0\n" + "2:\n" + ".section .fixup,\"ax\"\n" + "3: "_ASM_MOV" %2,%0\n" + " jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b,3b) + : "=r" (pfn) + : "m" (machine_to_phys_mapping[mfn]), "m" (max_mapnr) ); + + return pfn; +} + +/* + * We detect special mappings in one of two ways: + * 1. If the MFN is an I/O page then Xen will set the m2p entry + * to be outside our maximum possible pseudophys range. + * 2. If the MFN belongs to a different domain then we will certainly + * not have MFN in our p2m table. Conversely, if the page is ours, + * then we'll have p2m(m2p(MFN))==MFN. + * If we detect a special mapping then it doesn't have a 'struct page'. + * We force !pfn_valid() by returning an out-of-range pointer. + * + * NB. These checks require that, for any MFN that is not in our reservation, + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if + * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. + * + * NB2. When deliberately mapping foreign pages into the p2m table, you *must* + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we + * require. In all the cases we care about, the FOREIGN_FRAME bit is + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. 
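 *
 * A round-trip sketch for a RAM page this domain owns (pg is an
 * illustrative page; with XENFEAT_auto_translated_physmap both
 * conversions are the identity):
 *
 *	unsigned long pfn = page_to_pfn(pg);
 *	unsigned long mfn = pfn_to_mfn(pfn);
 *
 *	BUG_ON(mfn_to_pfn(mfn) != pfn);		// p2m(m2p(MFN)) == PFN
 *
 * For a deliberately mapped foreign page, the p2m slot is instead set
 * via set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)), so mfn_to_local_pfn()
 * below detects the mismatch and returns an out-of-range pfn.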
+ */ +static inline unsigned long mfn_to_local_pfn(phys_addr_t mfn) +{ + unsigned long pfn = mfn_to_pfn(mfn); + if (likely(pfn < max_mapnr) + && likely(!xen_feature(XENFEAT_auto_translated_physmap)) + && unlikely(phys_to_machine_mapping[pfn] != mfn)) + return max_mapnr; /* force !pfn_valid() */ + return pfn; +} + +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (likely(max_mapnr)) + BUG_ON(pfn >= max_mapnr); + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return; + } + phys_to_machine_mapping[pfn] = mfn; +} + +static inline maddr_t phys_to_machine(paddr_t phys) +{ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); + return machine; +} + +static inline paddr_t machine_to_phys(maddr_t machine) +{ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); + return phys; +} + +#ifdef CONFIG_X86_32 +# include "maddr_32.h" +#else +# include "maddr_64.h" +#endif + +#else /* !CONFIG_XEN */ + +#define pfn_to_mfn(pfn) (pfn) +#define mfn_to_pfn(mfn) (mfn) +#define mfn_to_local_pfn(mfn) (mfn) +#define set_phys_to_machine(pfn, mfn) ((void)0) +#define phys_to_machine_mapping_valid(pfn) 1 +#define phys_to_machine(phys) ((maddr_t)(phys)) +#define machine_to_phys(mach) ((paddr_t)(mach)) +#define pfn_pte_ma(pfn, prot) pfn_pte(pfn, prot) +#define __pte_ma(x) __pte(x) + +#endif /* !CONFIG_XEN */ + +/* VIRT <-> MACHINE conversion */ +#define virt_to_machine(v) phys_to_machine(__pa(v)) +#define virt_to_mfn(v) pfn_to_mfn(__pa(v) >> PAGE_SHIFT) +#define mfn_to_virt(m) __va(mfn_to_pfn(m) << PAGE_SHIFT) + +#endif /* _X86_MADDR_H */ diff --git a/arch/x86/include/mach-xen/asm/maddr_32.h b/arch/x86/include/mach-xen/asm/maddr_32.h new file mode 100644 index 0000000..de34d87 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/maddr_32.h @@ -0,0 +1,35 @@ +#ifndef _I386_MADDR_H +#define _I386_MADDR_H + +#ifdef CONFIG_X86_PAE +static inline paddr_t pte_phys_to_machine(paddr_t phys) +{ + /* + * In PAE mode, the NX bit needs to be dealt with in the value + * passed to pfn_to_mfn(). On x86_64, we need to mask it off, + * but for i386 the conversion to ulong for the argument will + * clip it off. + */ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK); + return machine; +} + +static inline paddr_t pte_machine_to_phys(maddr_t machine) +{ + /* + * In PAE mode, the NX bit needs to be dealt with in the value + * passed to mfn_to_pfn(). On x86_64, we need to mask it off, + * but for i386 the conversion to ulong for the argument will + * clip it off. 
+ */ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); + return phys; +} +#else +#define pte_phys_to_machine phys_to_machine +#define pte_machine_to_phys machine_to_phys +#endif + +#endif /* _I386_MADDR_H */ diff --git a/arch/x86/include/mach-xen/asm/maddr_64.h b/arch/x86/include/mach-xen/asm/maddr_64.h new file mode 100644 index 0000000..e2c271e --- /dev/null +++ b/arch/x86/include/mach-xen/asm/maddr_64.h @@ -0,0 +1,21 @@ +#ifndef _X86_64_MADDR_H +#define _X86_64_MADDR_H + +static inline paddr_t pte_phys_to_machine(paddr_t phys) +{ + maddr_t machine; + machine = pfn_to_mfn((phys & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); + machine = (machine << PAGE_SHIFT) | (phys & ~PHYSICAL_PAGE_MASK); + return machine; +} + +static inline paddr_t pte_machine_to_phys(maddr_t machine) +{ + paddr_t phys; + phys = mfn_to_pfn((machine & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT); + phys = (phys << PAGE_SHIFT) | (machine & ~PHYSICAL_PAGE_MASK); + return phys; +} + +#endif /* _X86_64_MADDR_H */ + diff --git a/arch/x86/include/mach-xen/asm/mmu_context.h b/arch/x86/include/mach-xen/asm/mmu_context.h new file mode 100644 index 0000000..1fbe9dd --- /dev/null +++ b/arch/x86/include/mach-xen/asm/mmu_context.h @@ -0,0 +1,165 @@ +#ifndef _ASM_X86_MMU_CONTEXT_H +#define _ASM_X86_MMU_CONTEXT_H + +#include +#include +#include +#include + +void arch_exit_mmap(struct mm_struct *mm); +void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); + +void mm_pin(struct mm_struct *mm); +void mm_unpin(struct mm_struct *mm); +void mm_pin_all(void); + +static inline void xen_activate_mm(struct mm_struct *prev, + struct mm_struct *next) +{ + if (!PagePinned(virt_to_page(next->pgd))) + mm_pin(next); +} + +/* + * Used for LDT copy/destruction. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context(struct mm_struct *mm); + + +static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); +#endif +} + +#define prepare_arch_switch(next) __prepare_arch_switch() + +static inline void __prepare_arch_switch(void) +{ +#ifdef CONFIG_X86_32 + /* + * Save away %gs. No need to save %fs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. + */ + lazy_save_gs(current->thread.gs); + lazy_load_gs(__KERNEL_STACK_CANARY); +#else + /* + * Save away %es, %ds, %fs and %gs. Must happen before reload + * of cr3/ldt (i.e., not in __switch_to). 
+ */ + __asm__ __volatile__ ( + "mov %%es,%0 ; mov %%ds,%1 ; mov %%fs,%2 ; mov %%gs,%3" + : "=m" (current->thread.es), + "=m" (current->thread.ds), + "=m" (current->thread.fsindex), + "=m" (current->thread.gsindex) ); + + if (current->thread.ds) + __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) ); + + if (current->thread.es) + __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) ); + + if (current->thread.fsindex) { + __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) ); + current->thread.fs = 0; + } + + if (current->thread.gsindex) { + load_gs_index(0); + current->thread.gs = 0; + } +#endif +} + +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + struct mmuext_op _op[2 + (sizeof(long) > 4)], *op = _op; +#ifdef CONFIG_X86_64 + pgd_t *upgd; +#endif + + if (likely(prev != next)) { + BUG_ON(!xen_feature(XENFEAT_writable_page_tables) && + !PagePinned(virt_to_page(next->pgd))); + +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + percpu_write(cpu_tlbstate.active_mm, next); +#endif + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* Re-load page tables: load_cr3(next->pgd) */ + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = virt_to_mfn(next->pgd); + op++; + + /* xen_new_user_pt(next->pgd) */ +#ifdef CONFIG_X86_64 + op->cmd = MMUEXT_NEW_USER_BASEPTR; + upgd = __user_pgd(next->pgd); + op->arg1.mfn = likely(upgd) ? virt_to_mfn(upgd) : 0; + op++; +#endif + + /* + * load the LDT, if the LDT is different: + */ + if (unlikely(prev->context.ldt != next->context.ldt)) { + /* load_LDT_nolock(&next->context) */ + op->cmd = MMUEXT_SET_LDT; + op->arg1.linear_addr = (unsigned long)next->context.ldt; + op->arg2.nr_ents = next->context.size; + op++; + } + + BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); + + /* stop TLB flushes for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + } +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) /* XEN: no lazy tlb */ + else { + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { + /* We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. 
+ */ + load_cr3(next->pgd); + xen_new_user_pt(next->pgd); + load_LDT_nolock(&next->context); + } + } +#endif +} + +#define activate_mm(prev, next) \ +do { \ + xen_activate_mm(prev, next); \ + switch_mm((prev), (next), NULL); \ +} while (0); + +#ifdef CONFIG_X86_32 +#define deactivate_mm(tsk, mm) \ +do { \ + lazy_load_gs(0); \ +} while (0) +#else +#define deactivate_mm(tsk, mm) \ +do { \ + load_gs_index(0); \ + loadsegment(fs, 0); \ +} while (0) +#endif + +#endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/mach-xen/asm/mutex.h b/arch/x86/include/mach-xen/asm/mutex.h new file mode 100644 index 0000000..ee9126e --- /dev/null +++ b/arch/x86/include/mach-xen/asm/mutex.h @@ -0,0 +1,3 @@ +#define arch_cpu_is_running(cpu) vcpu_running(cpu) + +#include_next diff --git a/arch/x86/include/mach-xen/asm/pci.h b/arch/x86/include/mach-xen/asm/pci.h new file mode 100644 index 0000000..54289aa --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pci.h @@ -0,0 +1,180 @@ +#ifndef _ASM_X86_PCI_H +#define _ASM_X86_PCI_H + +#include /* for struct page */ +#include +#include +#include +#include +#include +#include + +#ifdef __KERNEL__ + +struct pci_sysdata { + int domain; /* PCI domain */ + int node; /* NUMA node */ +#ifdef CONFIG_X86_64 + void *iommu; /* IOMMU private data */ +#endif +#ifdef CONFIG_XEN_PCIDEV_FRONTEND + struct pcifront_device *pdev; +#endif +}; + +extern int pci_routeirq; +extern int noioapicquirk; +extern int noioapicreroute; + +/* scan a bus after allocating a pci_sysdata for it */ +extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, + int node); +extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); + +#ifdef CONFIG_PCI + +#ifdef CONFIG_PCI_DOMAINS +static inline int pci_domain_nr(struct pci_bus *bus) +{ + struct pci_sysdata *sd = bus->sysdata; + return sd->domain; +} + +static inline int pci_proc_domain(struct pci_bus *bus) +{ + return pci_domain_nr(bus); +} +#endif + +/* Can be used to override the logic in pci_scan_bus for skipping + already-configured bus numbers - to be used for buggy BIOSes + or architectures with incomplete PCI setup by the loader */ + +extern unsigned int pcibios_assign_all_busses(void); +extern int pci_legacy_init(void); +# ifdef CONFIG_ACPI +# define x86_default_pci_init pci_acpi_init +# else +# define x86_default_pci_init pci_legacy_init +# endif +#else +# define pcibios_assign_all_busses() 0 +# define x86_default_pci_init NULL +#endif + +#include +#define pcibios_scan_all_fns(a, b) (!is_initial_xendomain()) + +extern unsigned long pci_mem_start; +#define PCIBIOS_MIN_IO 0x1000 +#define PCIBIOS_MIN_MEM (pci_mem_start) + +#define PCIBIOS_MIN_CARDBUS_IO 0x4000 + +extern int pcibios_enabled; +void pcibios_config_init(void); +struct pci_bus *pcibios_scan_root(int bus); + +void pcibios_set_master(struct pci_dev *dev); +void pcibios_penalize_isa_irq(int irq, int active); +struct irq_routing_table *pcibios_get_irq_routing_table(void); +int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); + + +#define HAVE_PCI_MMAP +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, + int write_combine); + + +#ifdef CONFIG_PCI +extern void early_quirks(void); +static inline void pci_dma_burst_advice(struct pci_dev *pdev, + enum pci_dma_burst_strategy *strat, + unsigned long *strategy_parameter) +{ + *strat = PCI_DMA_BURST_INFINITY; + *strategy_parameter = ~0UL; +} +#else +static inline void early_quirks(void) { } +#endif + +extern void pci_iommu_alloc(void); + +#if 
defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) +/* MSI arch specific hooks */ +static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + return x86_msi.setup_msi_irqs(dev, nvec, type); +} + +static inline void x86_teardown_msi_irqs(struct pci_dev *dev) +{ + x86_msi.teardown_msi_irqs(dev); +} + +static inline void x86_teardown_msi_irq(unsigned int irq) +{ + x86_msi.teardown_msi_irq(irq); +} +static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq) +{ + x86_msi.restore_msi_irqs(dev, irq); +} +#define arch_setup_msi_irqs x86_setup_msi_irqs +#define arch_teardown_msi_irqs x86_teardown_msi_irqs +#define arch_teardown_msi_irq x86_teardown_msi_irq +#define arch_restore_msi_irqs x86_restore_msi_irqs +/* implemented in arch/x86/kernel/apic/io_apic. */ +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); +void native_teardown_msi_irq(unsigned int irq); +void native_restore_msi_irqs(struct pci_dev *dev, int irq); +/* default to the implementation in drivers/lib/msi.c */ +#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS +#define HAVE_DEFAULT_MSI_RESTORE_IRQS +void default_teardown_msi_irqs(struct pci_dev *dev); +void default_restore_msi_irqs(struct pci_dev *dev, int irq); +#else +#define native_setup_msi_irqs NULL +#define native_teardown_msi_irq NULL +#define default_teardown_msi_irqs NULL +#define default_restore_msi_irqs NULL +#endif + +#define PCI_DMA_BUS_IS_PHYS 0 + +#endif /* __KERNEL__ */ + +#ifdef CONFIG_X86_64 +#include "../../asm/pci_64.h" +#endif + +/* implement the pci_ DMA API in terms of the generic device dma_ one */ +#include + +/* generic pci stuff */ +#include +#define PCIBIOS_MAX_MEM_32 0xffffffff + +#ifdef CONFIG_NUMA +/* Returns the node based on pci bus */ +static inline int __pcibus_to_node(const struct pci_bus *bus) +{ + const struct pci_sysdata *sd = bus->sysdata; + + return sd->node; +} + +static inline const struct cpumask * +cpumask_of_pcibus(const struct pci_bus *bus) +{ + int node; + + node = __pcibus_to_node(bus); + return (node == -1) ? 
cpu_online_mask : + cpumask_of_node(node); +} +#endif + +#endif /* _ASM_X86_PCI_H */ diff --git a/arch/x86/include/mach-xen/asm/percpu.h b/arch/x86/include/mach-xen/asm/percpu.h new file mode 100644 index 0000000..336a525 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/percpu.h @@ -0,0 +1,61 @@ +#ifndef _ASM_X86_XEN_PERCPU_H +#define _ASM_X86_XEN_PERCPU_H + +#include_next + +#define this_vcpu_read_1 this_cpu_read_1 +#define this_vcpu_read_2 this_cpu_read_2 +#define this_vcpu_read_4 this_cpu_read_4 + +#ifdef CONFIG_64BIT +# define this_vcpu_read_8 this_cpu_read_8 +#else +# define this_vcpu_read_8(pcp) ({ \ + typeof(pcp) res__; \ + __asm__ ("movl %%ebx,%%eax\n" \ + "movl %%ecx,%%edx\n" \ + "cmpxchg8b " __percpu_arg(1) \ + : "=&A" (res__) : "m" (pcp)); \ + res__; }) +#endif + +#define this_vcpu_read(pcp) __pcpu_size_call_return(this_vcpu_read_, pcp) + +#define percpu_exchange_op(op, var, val) \ +({ \ + typedef typeof(var) pxo_T__; \ + pxo_T__ pxo_ret__; \ + if (0) { \ + pxo_ret__ = (val); \ + (void)pxo_ret__; \ + } \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b %0,"__percpu_arg(1) \ + : "=q" (pxo_ret__), "+m" (var) \ + : "0" ((pxo_T__)(val))); \ + break; \ + case 2: \ + asm(op "w %0,"__percpu_arg(1) \ + : "=r" (pxo_ret__), "+m" (var) \ + : "0" ((pxo_T__)(val))); \ + break; \ + case 4: \ + asm(op "l %0,"__percpu_arg(1) \ + : "=r" (pxo_ret__), "+m" (var) \ + : "0" ((pxo_T__)(val))); \ + break; \ + case 8: \ + asm(op "q %0,"__percpu_arg(1) \ + : "=r" (pxo_ret__), "+m" (var) \ + : "0" ((pxo_T__)(val))); \ + break; \ + default: __bad_percpu_size(); \ + } \ + pxo_ret__; \ +}) + +#define percpu_xchg(var, val) percpu_exchange_op("xchg", var, val) +#define percpu_xadd(var, val) percpu_exchange_op("xadd", var, val) + +#endif /* _ASM_X86_XEN_PERCPU_H */ diff --git a/arch/x86/include/mach-xen/asm/perf_event.h b/arch/x86/include/mach-xen/asm/perf_event.h new file mode 100644 index 0000000..6c784d1 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/perf_event.h @@ -0,0 +1,42 @@ +#ifndef _ASM_X86_PERF_EVENT_H +#define _ASM_X86_PERF_EVENT_H + +#ifdef CONFIG_PERF_EVENTS + +/* + * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups. + * This flag is otherwise unused and ABI specified to be 0, so nobody should + * care what we do with it. + */ +#define PERF_EFLAGS_EXACT (1UL << 3) + +#define perf_instruction_pointer(regs) instruction_pointer(regs) + +#define perf_misc_flags(regs) ({ \ + struct pt_regs *_r_ = (regs); \ + unsigned long _f_ = user_mode(_r_) ? PERF_RECORD_MISC_USER \ + : PERF_RECORD_MISC_KERNEL; \ + _r_->flags & PERF_EFLAGS_EXACT ? _f_ | PERF_RECORD_MISC_EXACT_IP : _f_; \ +}) + +#include + +/* + * We abuse bit 3 from flags to pass exact information, see perf_misc_flags + * and the comment with PERF_EFLAGS_EXACT. 
+ */ +#define perf_arch_fetch_caller_regs(regs, __ip) { \ + (regs)->ip = (__ip); \ + (regs)->bp = caller_frame_pointer(); \ + (regs)->cs = __KERNEL_CS; \ + regs->flags = 0; \ + asm volatile( \ + _ASM_MOV "%%"_ASM_SP ", %0\n" \ + : "=m" ((regs)->sp) \ + :: "memory" \ + ); \ +} + +#endif + +#endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/arch/x86/include/mach-xen/asm/pgalloc.h b/arch/x86/include/mach-xen/asm/pgalloc.h new file mode 100644 index 0000000..3879075 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgalloc.h @@ -0,0 +1,159 @@ +#ifndef _ASM_X86_PGALLOC_H +#define _ASM_X86_PGALLOC_H + +#include +#include /* for struct page */ +#include + +#include /* for phys_to_virt and page_to_pseudophys */ + +static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } +static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {} + +static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, + unsigned long start, unsigned long count) {} +static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_release_pte(unsigned long pfn) {} +static inline void paravirt_release_pmd(unsigned long pfn) {} +static inline void paravirt_release_pud(unsigned long pfn) {} + +#ifdef CONFIG_X86_64 +void early_make_page_readonly(void *va, unsigned int feature); +pmd_t *early_get_pmd(unsigned long va); +#define make_lowmem_page_readonly make_page_readonly +#define make_lowmem_page_writable make_page_writable +#endif + +/* + * Flags to use when allocating a user page table page. + */ +extern gfp_t __userpte_alloc_gfp; + +/* + * Allocate and free page tables. + */ +extern pgd_t *pgd_alloc(struct mm_struct *); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); + +extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); +extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long); + +/* Should really implement gc for free page table pages. This could be + done with a reference count in struct page. 
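   The helpers below pair up in the usual way; a rough sketch of
   populating a pmd slot with a fresh pte page, loosely following
   __pte_alloc() (mm, pmd and addr are assumed to come from the caller):

	pgtable_t new = pte_alloc_one(mm, addr);

	if (new) {
		spin_lock(&mm->page_table_lock);
		if (pmd_none(*pmd))
			pmd_populate(mm, pmd, new);
		else
			pte_free(mm, new);
		spin_unlock(&mm->page_table_lock);
	}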
*/ + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); + make_lowmem_page_writable(pte, XENFEAT_writable_page_tables); + free_page((unsigned long)pte); +} + +extern void __pte_free(pgtable_t); +static inline void pte_free(struct mm_struct *mm, struct page *pte) +{ + __pte_free(pte); +} + +extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); + +static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, + unsigned long address) +{ + ___pte_free_tlb(tlb, pte); +} + +static inline void pmd_populate_kernel(struct mm_struct *mm, + pmd_t *pmd, pte_t *pte) +{ + paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); + set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + struct page *pte) +{ + unsigned long pfn = page_to_pfn(pte); + pmd_t ent = __pmd(((pmdval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE); + + paravirt_alloc_pte(mm, pfn); + if (PagePinned(virt_to_page(pmd))) { +#ifndef CONFIG_HIGHPTE + BUG_ON(PageHighMem(pte)); +#endif + set_pmd(pmd, ent); + } else + *pmd = ent; +} + +#define pmd_pgtable(pmd) pmd_page(pmd) + +#if PAGETABLE_LEVELS > 2 +extern pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr); +extern void __pmd_free(pgtable_t); + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); + __pmd_free(virt_to_page(pmd)); +} + +extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); + +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long address) +{ + ___pmd_free_tlb(tlb, pmd); +} + +#ifdef CONFIG_X86_PAE +extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); +#else /* !CONFIG_X86_PAE */ +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + pud_t ent = __pud(_PAGE_TABLE | __pa(pmd)); + + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + if (PagePinned(virt_to_page(pud))) + set_pud(pud, ent); + else + *pud = ent; +} +#endif /* CONFIG_X86_PAE */ + +#if PAGETABLE_LEVELS > 3 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + pgd_t ent = __pgd(_PAGE_TABLE | __pa(pud)); + + paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); + if (unlikely(PagePinned(virt_to_page(pgd)))) + xen_l4_entry_update(pgd, ent); + else + *__user_pgd(pgd) = *pgd = ent; +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return (pud_t *)pmd_alloc_one(mm, addr); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); + __pmd_free(virt_to_page(pud)); +} + +extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); + +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + ___pud_free_tlb(tlb, pud); +} + +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ + +#endif /* _ASM_X86_PGALLOC_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable-3level.h b/arch/x86/include/mach-xen/asm/pgtable-3level.h new file mode 100644 index 0000000..71e906c --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable-3level.h @@ -0,0 +1,152 @@ +#ifndef _ASM_X86_PGTABLE_3LEVEL_H +#define _ASM_X86_PGTABLE_3LEVEL_H + +/* + * Intel Physical Address Extension (PAE) Mode - three-level page + * tables on PPro+ CPUs. 
+ * + * Copyright (C) 1999 Ingo Molnar + */ + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016Lx pfn %08lx).\n", \ + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016Lx pfn %08Lx).\n", \ + __FILE__, __LINE__, &(e), __pmd_val(e), \ + (pmd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016Lx pfn %08Lx).\n", \ + __FILE__, __LINE__, &(e), __pgd_val(e), \ + (pgd_val(e) & PTE_PFN_MASK) >> PAGE_SHIFT) + +/* Rules for using set_pte: the pte being assigned *must* be + * either not present or in a state where the hardware will + * not attempt to update the pte. In places where this is + * not possible, use pte_get_and_clear to obtain the old pte + * value and then use set_pte to update it. -ben + */ + +static inline void xen_set_pte(pte_t *ptep, pte_t pte) +{ + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; +} + +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + xen_l2_entry_update(pmdp, pmd); +} + +static inline void xen_set_pud(pud_t *pudp, pud_t pud) +{ + xen_l3_entry_update(pudp, pud); +} + +/* + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table + * entry, so clear the bottom half first and enforce ordering with a compiler + * barrier. + */ +static inline void __xen_pte_clear(pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; +} + +#define xen_pmd_clear(pmd) \ +({ \ + pmd_t *__pmdp = (pmd); \ + PagePinned(virt_to_page(__pmdp)) \ + ? set_pmd(__pmdp, __pmd(0)) \ + : (void)(*__pmdp = __pmd(0)); \ +}) + +static inline void __xen_pud_clear(pud_t *pudp) +{ + set_pud(pudp, __pud(0)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + * + * Currently all places where pud_clear() is called either have + * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or + * pud_clear_bad()), so we don't need TLB flush here. + */ +} + +#define xen_pud_clear(pudp) \ +({ \ + pud_t *__pudp = (pudp); \ + PagePinned(virt_to_page(__pudp)) \ + ? __xen_pud_clear(__pudp) \ + : (void)(*__pudp = __pud(0)); \ +}) + +#ifdef CONFIG_SMP +static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res) +{ + uint64_t val = __pte_val(res); + if (__cmpxchg64(&ptep->pte, val, 0) != val) { + /* xchg acts as a barrier before the setting of the high bits */ + res.pte_low = xchg(&ptep->pte_low, 0); + res.pte_high = ptep->pte_high; + ptep->pte_high = 0; + } + return res; +} +#else +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) +#endif + +#define __pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \ + ((_pte).pte_high << (32-PAGE_SHIFT))) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifdef CONFIG_SMP +union split_pmd { + struct { + u32 pmd_low; + u32 pmd_high; + }; + pmd_t pmd; +}; +static inline pmd_t xen_pmdp_get_and_clear(pmd_t *pmdp) +{ + union split_pmd res, *orig = (union split_pmd *)pmdp; + + /* xchg acts as a barrier before setting of the high bits */ + res.pmd_low = xchg(&orig->pmd_low, 0); + res.pmd_high = orig->pmd_high; + orig->pmd_high = 0; + + return res.pmd; +} +#else +#define xen_pmdp_get_and_clear(xp) xen_local_pmdp_get_and_clear(xp) +#endif +#endif + +/* + * Bits 0, 6 and 7 are taken in the low part of the pte, + * put the 32 bits of offset into the high part. 
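 *
 * A worked example of the encoding described above (the offset value
 * is illustrative):
 *
 *	pte_t pte = pgoff_to_pte(0x12345678);
 *
 *	// pte.pte_low == _PAGE_FILE, pte.pte_high == 0x12345678
 *	BUG_ON(pte_to_pgoff(pte) != 0x12345678);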
+ */ +#define pte_to_pgoff(pte) ((pte).pte_high) +#define pgoff_to_pte(off) \ + ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } }) +#define PTE_FILE_MAX_BITS 32 + +/* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5) +#define __swp_type(x) (((x).val) & 0x1f) +#define __swp_offset(x) ((x).val >> 5) +#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5}) +#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) +#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) + +#endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable-3level_types.h b/arch/x86/include/mach-xen/asm/pgtable-3level_types.h new file mode 100644 index 0000000..36d6f2b --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable-3level_types.h @@ -0,0 +1,44 @@ +#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H +#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H + +#ifndef __ASSEMBLY__ +#include + +typedef u64 pteval_t; +typedef u64 pmdval_t; +typedef u64 pudval_t; +typedef u64 pgdval_t; +typedef u64 pgprotval_t; + +typedef union { + struct { + unsigned long pte_low, pte_high; + }; + pteval_t pte; +} pte_t; +#endif /* !__ASSEMBLY__ */ + +#define SHARED_KERNEL_PMD 0 + +#define PAGETABLE_LEVELS 3 + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 30 +#define PTRS_PER_PGD 4 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + + +#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable.h b/arch/x86/include/mach-xen/asm/pgtable.h new file mode 100644 index 0000000..cd43083 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable.h @@ -0,0 +1,885 @@ +#ifndef _ASM_X86_PGTABLE_H +#define _ASM_X86_PGTABLE_H + +#include +#include + +#include + +/* + * Macro to mark a page protection value as UC- + */ +#define pgprot_noncached(prot) \ + ((boot_cpu_data.x86 > 3) \ + ? (__pgprot(pgprot_val(prot) | _PAGE_CACHE_UC_MINUS)) \ + : (prot)) + +#ifndef __ASSEMBLY__ + +#include + +/* + * ZERO_PAGE is a global shared page that is always zero: used + * for zero-mapped memory areas etc.. 
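A quick worked example of the PAE geometry defined above (PGDIR_SHIFT 30 with only 4 top-level entries, PMD_SHIFT 21, 512 entries per pmd and pte page): a 32-bit virtual address splits into 2 + 9 + 9 + 12 bits. The helper below is purely illustrative and not part of the patch.

/*
 * For vaddr = 0xc0123456:
 *   pgd index = 0xc0123456 >> 30         = 3
 *   pmd index = (0xc0123456 >> 21) & 511 = 0
 *   pte index = (0xc0123456 >> 12) & 511 = 0x123
 *   offset into the page                 = 0x456
 */
static inline void example_pae_split(unsigned long vaddr, unsigned int *pgd_i,
                                     unsigned int *pmd_i, unsigned int *pte_i)
{
        *pgd_i = (vaddr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
        *pmd_i = (vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
        *pte_i = (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}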
+ */ +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) + +extern spinlock_t pgd_lock; +extern struct list_head pgd_list; + +extern struct mm_struct *pgd_page_get_mm(struct page *page); + +#define set_pte(ptep, pte) xen_set_pte(ptep, pte) +#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) +#define set_pmd_at(mm, addr, pmdp, pmd) xen_set_pmd_at(mm, addr, pmdp, pmd) + +#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd) + +#ifndef __PAGETABLE_PUD_FOLDED +#define set_pgd(pgdp, pgd) xen_set_pgd(pgdp, pgd) +#define pgd_clear(pgd) xen_pgd_clear(pgd) +#endif + +#ifndef set_pud +# define set_pud(pudp, pud) xen_set_pud(pudp, pud) +#endif + +#ifndef __PAGETABLE_PMD_FOLDED +#define pud_clear(pud) xen_pud_clear(pud) +#endif + +#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep) +#define pmd_clear(pmd) xen_pmd_clear(pmd) + +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) +#define pmd_update(mm, addr, ptep) do { } while (0) +#define pmd_update_defer(mm, addr, ptep) do { } while (0) + +#define pgd_val(x) xen_pgd_val(x) +#define __pgd(x) xen_make_pgd(x) + +#ifndef __PAGETABLE_PUD_FOLDED +#define pud_val(x) xen_pud_val(x) +#define __pud(x) xen_make_pud(x) +#endif + +#ifndef __PAGETABLE_PMD_FOLDED +#define pmd_val(x) xen_pmd_val(x) +#define __pmd(x) xen_make_pmd(x) +#endif + +#define pte_val(x) xen_pte_val(x) +#define __pte(x) xen_make_pte(x) + +#define arch_end_context_switch(prev) do {} while(0) + +/* + * The following only work if pte_present() is true. + * Undefined behaviour if not.. + */ +static inline int pte_dirty(pte_t pte) +{ + return pte_flags(pte) & _PAGE_DIRTY; +} + +static inline int pte_young(pte_t pte) +{ + return pte_flags(pte) & _PAGE_ACCESSED; +} + +static inline int pmd_young(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_ACCESSED; +} + +static inline int pte_write(pte_t pte) +{ + return pte_flags(pte) & _PAGE_RW; +} + +static inline int pte_file(pte_t pte) +{ + return pte_flags(pte) & _PAGE_FILE; +} + +static inline int pte_huge(pte_t pte) +{ + return pte_flags(pte) & _PAGE_PSE; +} + +static inline int pte_global(pte_t pte) +{ + return 0; +} + +static inline int pte_exec(pte_t pte) +{ + return !(pte_flags(pte) & _PAGE_NX); +} + +static inline int pte_special(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SPECIAL; +} + +#define pte_mfn(_pte) ((_pte).pte_low & _PAGE_PRESENT ? \ + __pte_mfn(_pte) : pfn_to_mfn(__pte_mfn(_pte))) +#define pte_pfn(_pte) ((_pte).pte_low & _PAGE_IOMAP ? max_mapnr : \ + (_pte).pte_low & _PAGE_PRESENT ? 
\ + mfn_to_local_pfn(__pte_mfn(_pte)) : \ + __pte_mfn(_pte)) + +#define pte_page(pte) pfn_to_page(pte_pfn(pte)) + +static inline unsigned long pmd_pfn(pmd_t pmd) +{ + return (pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT; +} + +static inline int pmd_large(pmd_t pte) +{ + return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == + (_PAGE_PSE | _PAGE_PRESENT); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_SPLITTING; +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_PSE; +} + +static inline int has_transparent_hugepage(void) +{ + return cpu_has_pse; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static inline pte_t pte_set_flags(pte_t pte, pteval_t set) +{ + pteval_t v = __pte_val(pte); + + return __pte_ma(v | set); +} + +static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) +{ + pteval_t v = __pte_val(pte); + + return __pte_ma(v & ~clear); +} + +static inline pte_t pte_mkclean(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_DIRTY); +} + +static inline pte_t pte_mkold(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_ACCESSED); +} + +static inline pte_t pte_wrprotect(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_RW); +} + +static inline pte_t pte_mkexec(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_NX); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_ACCESSED); +} + +static inline pte_t pte_mkwrite(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_RW); +} + +static inline pte_t pte_mkhuge(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_PSE); +} + +static inline pte_t pte_clrhuge(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_PSE); +} + +static inline pte_t pte_mkglobal(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_clrglobal(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_mkspecial(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SPECIAL); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v & ~clear); +} + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DIRTY); +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_PSE); +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_mkwrite(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} +#endif + +/* + * Mask out unsupported bits in a present pgprot. Non-present pgprots + * can use those bits for other purposes, so leave them be. 
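Side note, not part of the patch: the pte_mk*()/pte_clr*() helpers above are all thin wrappers around pte_set_flags()/pte_clear_flags(), so they compose freely without disturbing the machine frame number. A hypothetical example:

static inline pte_t example_make_clean_readonly(pte_t pte)
{
        /* drop write permission and the dirty bit, keep everything else */
        return pte_wrprotect(pte_mkclean(pte));
}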
+ */ +static inline pgprotval_t massage_pgprot(pgprot_t pgprot) +{ + pgprotval_t protval = pgprot_val(pgprot); + + if (protval & _PAGE_PRESENT) + protval &= __supported_pte_mask; + + return protval; +} + +static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) +{ + return __pte(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + +static inline pte_t pfn_pte_ma(phys_addr_t page_nr, pgprot_t pgprot) +{ + return __pte_ma((page_nr << PAGE_SHIFT) | massage_pgprot(pgprot)); +} + +static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) +{ + return __pmd(((phys_addr_t)page_nr << PAGE_SHIFT) | + massage_pgprot(pgprot)); +} + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + pteval_t val = pte_val(pte) & _PAGE_CHG_MASK; + + val |= massage_pgprot(newprot) & ~_PAGE_CHG_MASK; + + return __pte(val); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + pmdval_t val = pmd_val(pmd); + + val &= _HPAGE_CHG_MASK; + val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK; + + return __pmd(val); +} +#endif + +/* mprotect needs to preserve PAT bits when updating vm_page_prot */ +#define pgprot_modify pgprot_modify +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK; + pgprotval_t addbits = pgprot_val(newprot); + return __pgprot(preservebits | addbits); +} + +#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK) + +#define canon_pgprot(p) __pgprot(massage_pgprot(p)) + +static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, + unsigned long flags, + unsigned long new_flags) +{ + /* + * PAT type is always WB for untracked ranges, so no need to check. + */ + if (x86_platform.is_untracked_pat_range(paddr, paddr + size)) + return 1; + + /* + * Certain new memtypes are not allowed with certain + * requested memtype: + * - request is uncached, return cannot be write-back + * - request is write-combine, return cannot be write-back + */ + if ((flags == _PAGE_CACHE_UC_MINUS && + new_flags == _PAGE_CACHE_WB) || + (flags == _PAGE_CACHE_WC && + new_flags == _PAGE_CACHE_WB)) { + return 0; + } + + return 1; +} + +pmd_t *populate_extra_pmd(unsigned long vaddr); +pte_t *populate_extra_pte(unsigned long vaddr); +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_X86_32 +# include "pgtable_32.h" +#else +# include "pgtable_64.h" +#endif + +#ifndef __ASSEMBLY__ +#include + +static inline int pte_none(pte_t pte) +{ + return !pte.pte; +} + +#define __HAVE_ARCH_PTE_SAME +static inline int pte_same(pte_t a, pte_t b) +{ + return a.pte == b.pte; +} + +static inline int pte_present(pte_t a) +{ + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); +} + +static inline int pte_hidden(pte_t pte) +{ + return pte_flags(pte) & _PAGE_HIDDEN; +} + +static inline int pmd_present(pmd_t pmd) +{ +#if CONFIG_XEN_COMPAT <= 0x030002 +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. + can temporarily clear it. */ + return __pmd_val(pmd) != 0; +#else + return pmd_flags(pmd) & _PAGE_PRESENT; +#endif +} + +static inline int pmd_none(pmd_t pmd) +{ + /* Only check low word on 32-bit platforms, since it might be + out of sync with upper half. 
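Illustrative usage, not part of the patch: pfn_pte() and pte_modify() above are the usual ways to build or re-protect an entry, with massage_pgprot() silently dropping bits the CPU does not support (for example _PAGE_NX when it is not in __supported_pte_mask). PAGE_KERNEL and PAGE_KERNEL_RO come from pgtable_types.h later in this patch; the helper names are hypothetical.

static inline pte_t example_kernel_pte(unsigned long pfn)
{
        return pfn_pte(pfn, PAGE_KERNEL);       /* pfn plus filtered protections */
}

static inline pte_t example_make_readonly(pte_t pte)
{
        /* keeps the pfn and the _PAGE_CHG_MASK bits, swaps the rest */
        return pte_modify(pte, PAGE_KERNEL_RO);
}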
*/ + return (unsigned long)__pmd_val(pmd) == 0; +} + +static inline unsigned long pmd_page_vaddr(pmd_t pmd) +{ + return (unsigned long)__va(pmd_val(pmd) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) + +/* + * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] + * + * this macro returns the index of the entry in the pmd page which would + * control the given virtual address + */ +static inline unsigned long pmd_index(unsigned long address) +{ + return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); +} + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + * + * (Currently stuck as a macro because of indirect forward reference + * to linux/mm.h:page_to_nid()) + */ +#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) + +/* + * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] + * + * this function returns the index of the entry in the pte page which would + * control the given virtual address + */ +static inline unsigned long pte_index(unsigned long address) +{ + return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); +} + +static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) +{ + return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); +} + +static inline int pmd_bad(pmd_t pmd) +{ +#if CONFIG_XEN_COMPAT <= 0x030002 + return (pmd_flags(pmd) & ~_PAGE_USER & ~_PAGE_PRESENT) + != (_KERNPG_TABLE & ~_PAGE_PRESENT); +#else + return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; +#endif +} + +static inline unsigned long pages_to_mb(unsigned long npg) +{ + return npg >> (20 - PAGE_SHIFT); +} + +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + direct_remap_pfn_range(vma, vaddr, pfn, size, prot, DOMID_IO) + +#if PAGETABLE_LEVELS > 2 +static inline int pud_none(pud_t pud) +{ + return __pud_val(pud) == 0; +} + +static inline int pud_present(pud_t pud) +{ + return pud_flags(pud) & _PAGE_PRESENT; +} + +static inline unsigned long pud_page_vaddr(pud_t pud) +{ + return (unsigned long)__va((unsigned long)pud_val(pud) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pud_page(pud) pfn_to_page(pud_val(pud) >> PAGE_SHIFT) + +/* Find an entry in the second-level page table.. 
*/ +static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) +{ + return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); +} + +static inline int pud_large(pud_t pud) +{ + return (__pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) == + (_PAGE_PSE | _PAGE_PRESENT); +} + +static inline int pud_bad(pud_t pud) +{ + return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; +} +#else +static inline int pud_large(pud_t pud) +{ + return 0; +} +#endif /* PAGETABLE_LEVELS > 2 */ + +#if PAGETABLE_LEVELS > 3 +static inline int pgd_present(pgd_t pgd) +{ + return pgd_flags(pgd) & _PAGE_PRESENT; +} + +static inline unsigned long pgd_page_vaddr(pgd_t pgd) +{ + return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +/* to find an entry in a page-table-directory. */ +static inline unsigned long pud_index(unsigned long address) +{ + return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); +} + +static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) +{ + return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); +} + +static inline int pgd_bad(pgd_t pgd) +{ + return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; +} + +static inline int pgd_none(pgd_t pgd) +{ + return !__pgd_val(pgd); +} +#endif /* PAGETABLE_LEVELS > 3 */ + +#endif /* __ASSEMBLY__ */ + +/* + * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] + * + * this macro returns the index of the entry in the pgd page which would + * control the given virtual address + */ +#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) + +/* + * pgd_offset() returns a (pgd_t *) + * pgd_index() is used get the offset into the pgd page's array of pgd_t's; + */ +#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) +/* + * a shortcut which implies the use of the kernel's pgd, instead + * of a process's + */ +#define pgd_offset_k(address) pgd_offset(&init_mm, (address)) + + +#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET) +#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY) + +#ifndef __ASSEMBLY__ + +#define direct_gbpages 0 + +/* local pte updates need not use xchg for locking */ +static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res) +{ + xen_set_pte(ptep, __pte(0)); + return res; +} + +static inline pmd_t xen_local_pmdp_get_and_clear(pmd_t *pmdp) +{ + pmd_t res = *pmdp; + + xen_set_pmd(pmdp, __pmd(0)); + return res; +} + +static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep , pte_t pte) +{ + if ((mm != current->mm && mm != &init_mm) || + HYPERVISOR_update_va_mapping(addr, pte, 0)) + xen_set_pte(ptep, pte); +} + +static inline void xen_set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp , pmd_t pmd) +{ + xen_set_pmd(pmdp, pmd); +} + +static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + if ((mm != current->mm && mm != &init_mm) + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) + __xen_pte_clear(ptep); +} + +#ifndef CONFIG_PARAVIRT +/* + * Rules for using pte_update - it must be called after any PTE update which + * has not been done using the set_pte / clear_pte interfaces. It is used by + * shadow mode hypervisors to resynchronize the shadow page tables. 
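Taken together, the *_index()/*_offset() helpers above give the usual software page-table walk. The sketch below is not part of the patch, the name is hypothetical, and huge mappings simply bail out; it shows how the helpers chain for a kernel virtual address.

static pte_t *example_walk_kernel_address(unsigned long address)
{
        pgd_t *pgd = pgd_offset_k(address);
        pud_t *pud;
        pmd_t *pmd;

        if (pgd_none(*pgd))
                return NULL;
        pud = pud_offset(pgd, address);
        if (pud_none(*pud) || pud_large(*pud))
                return NULL;
        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd) || pmd_large(*pmd))
                return NULL;
        return pte_offset_kernel(pmd, address);
}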
Kernel PTE + * updates should either be sets, clears, or set_pte_atomic for P->P + * transitions, which means this hook should only be called for user PTEs. + * This hook implies a P->P protection or access change has taken place, which + * requires a subsequent TLB flush. The notification can optionally be delayed + * until the TLB flush event by using the pte_update_defer form of the + * interface, but care must be taken to assure that the flush happens while + * still holding the same page table lock so that the shadow and primary pages + * do not become out of sync on SMP. + */ +#define pte_update(mm, addr, ptep) do { } while (0) +#define pte_update_defer(mm, addr, ptep) do { } while (0) +#endif + +/* + * We only update the dirty/accessed state if we set + * the dirty bit by hand in the kernel, since the hardware + * will do the accessed bit for us, and we don't want to + * race with other CPU's that might be updating the dirty + * bit at the same time. + */ +struct vm_area_struct; + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +extern int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +extern int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); + +#define __HAVE_ARCH_PTEP_CLEAR_FLUSH +#define ptep_clear_flush(vma, addr, ptep) \ +({ \ + pte_t *__ptep = (ptep); \ + pte_t __res = *__ptep; \ + if (!pte_none(__res) && \ + ((vma)->vm_mm != current->mm || \ + HYPERVISOR_update_va_mapping(addr, __pte(0), \ + uvm_multi(mm_cpumask((vma)->vm_mm)) | \ + UVMF_INVLPG))) { \ + __xen_pte_clear(__ptep); \ + flush_tlb_page(vma, addr); \ + } \ + __res; \ +}) + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_t pte = *ptep; + if (!pte_none(pte) + && (mm != &init_mm + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) { + pte = xen_ptep_get_and_clear(ptep, pte); + pte_update(mm, addr, ptep); + } + return pte; +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL +#define ptep_get_and_clear_full(mm, addr, ptep, full) \ + ((full) ? 
({ \ + pte_t *__ptep = (ptep); \ + pte_t __res = *__ptep; \ + if (!PagePinned(virt_to_page((mm)->pgd))) \ + __xen_pte_clear(__ptep); \ + else if (!pte_none(__res)) \ + xen_l1_entry_update(__ptep, __pte(0)); \ + __res; \ + }) : \ + ptep_get_and_clear(mm, addr, ptep)) + +pte_t xen_ptep_get_and_clear_full(struct vm_area_struct *, unsigned long, pte_t *, int); + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = *ptep; + if (pte_write(pte)) + set_pte_at(mm, addr, ptep, pte_wrprotect(pte)); +} + +#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) + +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +extern int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + + +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_RW; +} + +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + pmd_t pmd = xen_pmdp_get_and_clear(pmdp); + pmd_update(mm, addr, pmdp); + return pmd; +} +#endif + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); + pmd_update(mm, addr, pmdp); +} +#endif + +/* + * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); + * + * dst - pointer to pgd range anwhere on a pgd page + * src - "" + * count - the number of pgds to copy. + * + * dst and src can be on the same page, but the range must not overlap, + * and must not cross a page boundary. 
+ */ +static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) +{ + memcpy(dst, src, count * sizeof(pgd_t)); +} + +#define arbitrary_virt_to_mfn(va) \ +({ \ + unsigned int __lvl; \ + pte_t *__ptep = lookup_address((unsigned long)(va), &__lvl); \ + BUG_ON(!__ptep || __lvl != PG_LEVEL_4K || !pte_present(*__ptep));\ + pte_mfn(*__ptep); \ +}) + +#define arbitrary_virt_to_machine(va) \ + (((maddr_t)arbitrary_virt_to_mfn(va) << PAGE_SHIFT) \ + | ((unsigned long)(va) & (PAGE_SIZE - 1))) + +#ifdef CONFIG_HIGHPTE +#include +struct page *kmap_atomic_to_page(void *); +#define ptep_to_machine(ptep) \ +({ \ + pte_t *__ptep = (ptep); \ + page_to_phys(kmap_atomic_to_page(__ptep)) \ + | ((unsigned long)__ptep & (PAGE_SIZE - 1)); \ +}) +#else +#define ptep_to_machine(ptep) virt_to_machine(ptep) +#endif + +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ +#if CONFIG_XEN_COMPAT < 0x030300 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) + return ptep_get_and_clear(mm, addr, ptep); +#endif + return *ptep; +} + +static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + mmu_update_t u; + +#if CONFIG_XEN_COMPAT < 0x030300 + if (unlikely(!xen_feature(XENFEAT_mmu_pt_update_preserve_ad))) { + set_pte_at(mm, addr, ptep, pte); + return; + } +#endif + u.ptr = ptep_to_machine(ptep) | MMU_PT_UPDATE_PRESERVE_AD; + u.val = __pte_val(pte); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF)) + BUG(); +} + +#include + +#include +void make_page_readonly(void *va, unsigned int feature); +void make_page_writable(void *va, unsigned int feature); +void make_pages_readonly(void *va, unsigned int nr, unsigned int feature); +void make_pages_writable(void *va, unsigned int nr, unsigned int feature); + +struct vm_area_struct; + +int direct_remap_pfn_range(struct vm_area_struct *vma, + unsigned long address, + phys_addr_t mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); +int direct_kernel_remap_pfn_range(unsigned long address, + unsigned long mfn, + unsigned long size, + pgprot_t prot, + domid_t domid); +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + uint64_t *ptep); + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable_32.h b/arch/x86/include/mach-xen/asm/pgtable_32.h new file mode 100644 index 0000000..7d89873 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable_32.h @@ -0,0 +1,89 @@ +#ifndef _ASM_X86_PGTABLE_32_H +#define _ASM_X86_PGTABLE_32_H + +#include + +/* + * The Linux memory management assumes a three-level page table setup. On + * the i386, we use that, but "fold" the mid level into the top-level page + * table, so that we physically have the same two-level page table as the + * i386 mmu expects. + * + * This file contains the functions and defines necessary to modify and use + * the i386 page table tree. 
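ptep_modify_prot_start()/_commit() above form a small transaction around a protection change: start reads the old pte (falling back to get-and-clear only for pre-3.3 hypervisors that cannot preserve the accessed/dirty bits), the caller derives the new pte, and commit pushes it with a single MMU_PT_UPDATE_PRESERVE_AD mmu_update. A sketch of the calling pattern, not part of the patch; the caller is assumed to hold the pte lock.

static inline void example_change_protection(struct mm_struct *mm,
                                             unsigned long addr,
                                             pte_t *ptep, pgprot_t newprot)
{
        pte_t pte = ptep_modify_prot_start(mm, addr, ptep);

        pte = pte_modify(pte, newprot);
        ptep_modify_prot_commit(mm, addr, ptep, pte);
}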
+ */ +#ifndef __ASSEMBLY__ +#include +#include +#include + +#include +#include +#include +#include + +struct vm_area_struct; + +extern pgd_t *swapper_pg_dir; +extern pgd_t initial_page_table[1024]; + +static inline void pgtable_cache_init(void) { } +static inline void check_pgt_cache(void) { } +void paging_init(void); + +extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); + + +/* + * Define this if things work differently on an i386 and an i486: + * it will (on an i486) warn about kernel memory accesses that are + * done without a 'access_ok(VERIFY_WRITE,..)' + */ +#undef TEST_ACCESS_OK + +#ifdef CONFIG_X86_PAE +# include +#else +# include +#endif + +#if defined(CONFIG_HIGHPTE) +#define pte_offset_map(dir, address) \ + ((pte_t *)kmap_atomic_pte(pmd_page(*(dir))) + \ + pte_index((address))) +#define pte_unmap(pte) kunmap_atomic((pte)) +#else +#define pte_offset_map(dir, address) \ + ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) +#define pte_unmap(pte) do { } while (0) +#endif + +/* Clear a kernel PTE and flush it from the TLB */ +#define kpte_clear_flush(ptep, vaddr) \ +do { \ + if (HYPERVISOR_update_va_mapping(vaddr, __pte(0), UVMF_INVLPG)) \ + BUG(); \ +} while (0) + +/* + * The i386 doesn't have any external MMU info: the kernel page + * tables contain all the necessary information. + */ +#define update_mmu_cache(vma, address, ptep) do { } while (0) + +void make_lowmem_page_readonly(void *va, unsigned int feature); +void make_lowmem_page_writable(void *va, unsigned int feature); + +#endif /* !__ASSEMBLY__ */ + +/* + * kern_addr_valid() is (1) for FLATMEM and (0) for + * SPARSEMEM and DISCONTIGMEM + */ +#ifdef CONFIG_FLATMEM +#define kern_addr_valid(addr) (1) +#else +#define kern_addr_valid(kaddr) (0) +#endif + +#endif /* _ASM_X86_PGTABLE_32_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable_64.h b/arch/x86/include/mach-xen/asm/pgtable_64.h new file mode 100644 index 0000000..f58b2ef --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable_64.h @@ -0,0 +1,203 @@ +#ifndef _ASM_X86_PGTABLE_64_H +#define _ASM_X86_PGTABLE_64_H + +#include +#include + +#ifndef __ASSEMBLY__ + +/* + * This file contains the functions and defines necessary to modify and use + * the x86-64 page table tree. 
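With CONFIG_HIGHPTE the 32-bit pte pages themselves can live in highmem, so the pte_offset_map() above has to set up a temporary atomic kmap, and every call must be paired with pte_unmap(). Usage sketch, not part of the patch; the caller is assumed to hold the relevant pte lock if the entry may change.

static inline pte_t example_read_pte(pmd_t *pmd, unsigned long addr)
{
        pte_t *ptep = pte_offset_map(pmd, addr);
        pte_t pte = *ptep;

        pte_unmap(ptep);        /* a no-op unless the pte page was kmapped */
        return pte;
}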
+ */ +#include +#include +#include +#include + +#ifdef CONFIG_XEN +extern pud_t level3_user_pgt[512]; + +extern void xen_init_pt(void); +extern void xen_switch_pt(void); +#endif + +extern pud_t level3_kernel_pgt[512]; +extern pud_t level3_ident_pgt[512]; +extern pmd_t level2_kernel_pgt[512]; +extern pmd_t level2_fixmap_pgt[512]; +extern pmd_t level2_ident_pgt[512]; +extern pgd_t init_level4_pgt[]; + +#define swapper_pg_dir init_level4_pgt + +extern void paging_init(void); + +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016lx pfn %010lx).\n", \ + __FILE__, __LINE__, &(e), __pte_val(e), pte_pfn(e)) +#define pmd_ERROR(e) \ + printk("%s:%d: bad pmd %p(%016lx pfn %010lx).\n", \ + __FILE__, __LINE__, &(e), __pmd_val(e), pmd_pfn(e)) +#define pud_ERROR(e) \ + printk("%s:%d: bad pud %p(%016lx pfn %010Lx).\n", \ + __FILE__, __LINE__, &(e), __pud_val(e), \ + (pud_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) +#define pgd_ERROR(e) \ + printk("%s:%d: bad pgd %p(%016lx pfn %010Lx).\n", \ + __FILE__, __LINE__, &(e), __pgd_val(e), \ + (pgd_val(e) & __PHYSICAL_MASK) >> PAGE_SHIFT) + +struct mm_struct; + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); + + +#define __xen_pte_clear(ptep) xen_set_pte(ptep, __pte(0)) + +static inline void xen_set_pte(pte_t *ptep, pte_t pte) +{ + *ptep = pte; +} + +static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + xen_l2_entry_update(pmdp, pmd); +} + +#define xen_pmd_clear(pmd) \ +({ \ + pmd_t *__pmdp = (pmd); \ + PagePinned(virt_to_page(__pmdp)) \ + ? set_pmd(__pmdp, xen_make_pmd(0)) \ + : (void)(*__pmdp = xen_make_pmd(0)); \ +}) + +#ifdef CONFIG_SMP +static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t ret) +{ + return __pte_ma(xchg(&xp->pte, 0)); +} +#else +#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#ifdef CONFIG_SMP +static inline pmd_t xen_pmdp_get_and_clear(pmd_t *xp) +{ + return xen_make_pmd(xchg(&xp->pmd, 0)); +} +#else +#define xen_pmdp_get_and_clear(xp) xen_local_pmdp_get_and_clear(xp) +#endif +#endif + +static inline void xen_set_pud(pud_t *pudp, pud_t pud) +{ + xen_l3_entry_update(pudp, pud); +} + +#define xen_pud_clear(pud) \ +({ \ + pud_t *__pudp = (pud); \ + PagePinned(virt_to_page(__pudp)) \ + ? set_pud(__pudp, xen_make_pud(0)) \ + : (void)(*__pudp = xen_make_pud(0)); \ +}) + +static inline pgd_t *__user_pgd(pgd_t *pgd) +{ + if (unlikely(((unsigned long)pgd & PAGE_MASK) + == (unsigned long)init_level4_pgt)) + return NULL; + return (pgd_t *)(virt_to_page(pgd)->private + + ((unsigned long)pgd & ~PAGE_MASK)); +} + +static inline void xen_set_pgd(pgd_t *pgdp, pgd_t pgd) +{ + xen_l4_entry_update(pgdp, pgd); +} + +#define xen_pgd_clear(pgd) \ +({ \ + pgd_t *__pgdp = (pgd); \ + PagePinned(virt_to_page(__pgdp)) \ + ? xen_l4_entry_update(__pgdp, xen_make_pgd(0)) \ + : (void)(*__user_pgd(__pgdp) = *__pgdp = xen_make_pgd(0)); \ +}) + +#define __pte_mfn(_pte) (((_pte).pte & PTE_PFN_MASK) >> PAGE_SHIFT) + +extern unsigned long early_arbitrary_virt_to_mfn(void *va); + +extern void sync_global_pgds(unsigned long start, unsigned long end); + +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + */ + +/* + * Level 4 access. 
+ */ +static inline int pgd_large(pgd_t pgd) { return 0; } +#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) + +/* PUD - Level3 access */ + +/* PMD - Level 2 access */ +#define pte_to_pgoff(pte) ((__pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) +#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \ + _PAGE_FILE }) +#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT + +/* PTE - Level 1 access. */ + +/* x86-64 always has all page tables mapped. */ +#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) +#define pte_unmap(pte) ((void)(pte))/* NOP */ + +#define update_mmu_cache(vma, address, ptep) do { } while (0) + +/* Encode and de-code a swap entry */ +#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE +#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#else +#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) +#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) +#endif + +#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) + +#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ + & ((1U << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << (_PAGE_BIT_PRESENT + 1)) \ + | ((offset) << SWP_OFFSET_SHIFT) }) +#define __pte_to_swp_entry(pte) ((swp_entry_t) { __pte_val(pte) }) +#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) + +extern int kern_addr_valid(unsigned long addr); + +#define HAVE_ARCH_UNMAPPED_AREA +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN + +#define pgtable_cache_init() do { } while (0) +#define check_pgt_cache() do { } while (0) + +#define PAGE_AGP PAGE_KERNEL_NOCACHE +#define HAVE_PAGE_AGP 1 + +/* fs/proc/kcore.c */ +#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) +#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) + +#define __HAVE_ARCH_PTE_SAME + +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable_64_types.h b/arch/x86/include/mach-xen/asm/pgtable_64_types.h new file mode 100644 index 0000000..c4c4665 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable_64_types.h @@ -0,0 +1,64 @@ +#ifndef _ASM_X86_PGTABLE_64_DEFS_H +#define _ASM_X86_PGTABLE_64_DEFS_H + +#ifndef __ASSEMBLY__ +#include + +/* + * These are used to make use of C type-checking.. + */ +typedef unsigned long pteval_t; +typedef unsigned long pmdval_t; +typedef unsigned long pudval_t; +typedef unsigned long pgdval_t; +typedef unsigned long pgprotval_t; + +typedef union { pteval_t pte; unsigned int pte_low; } pte_t; + +#endif /* !__ASSEMBLY__ */ + +#define SHARED_KERNEL_PMD 0 +#define PAGETABLE_LEVELS 4 + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 39 +#define PTRS_PER_PGD 512 + +/* + * 3rd level page + */ +#define PUD_SHIFT 30 +#define PTRS_PER_PUD 512 + +/* + * PMD_SHIFT determines the size of the area a middle-level + * page table can map + */ +#define PMD_SHIFT 21 +#define PTRS_PER_PMD 512 + +/* + * entries per page directory level + */ +#define PTRS_PER_PTE 512 + +#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE - 1)) +#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE - 1)) +#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE - 1)) + +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
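The swap-entry encoding above stores the type just above the present bit and the offset above the file/protnone bit, so both survive a round trip through a non-present pte. A hypothetical self-check, not part of the patch; it holds whenever the type passes MAX_SWAPFILES_CHECK() and the offset fits below the pte width.

static inline int example_swap_roundtrip_ok(unsigned long type,
                                            unsigned long offset)
{
        swp_entry_t e = __swp_entry(type, offset);

        return __swp_type(e) == type && __swp_offset(e) == offset;
}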
*/ +#define MAX_PHYSMEM_BITS 43 +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define VMALLOC_START _AC(0xffffc90000000000, UL) +#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) +#define VMEMMAP_START _AC(0xffffea0000000000, UL) +#define MODULES_VADDR _AC(0xffffffffa0000000, UL) +#define MODULES_END _AC(0xffffffffff000000, UL) +#define MODULES_LEN (MODULES_END - MODULES_VADDR) + +#endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/include/mach-xen/asm/pgtable_types.h b/arch/x86/include/mach-xen/asm/pgtable_types.h new file mode 100644 index 0000000..d0ca475 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/pgtable_types.h @@ -0,0 +1,392 @@ +#ifndef _ASM_X86_PGTABLE_DEFS_H +#define _ASM_X86_PGTABLE_DEFS_H + +#include +#include + +#define FIRST_USER_ADDRESS 0 + +#define _PAGE_BIT_PRESENT 0 /* is present */ +#define _PAGE_BIT_RW 1 /* writeable */ +#define _PAGE_BIT_USER 2 /* userspace addressable */ +#define _PAGE_BIT_PWT 3 /* page write through */ +#define _PAGE_BIT_PCD 4 /* page cache disabled */ +#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ +#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ +#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ +#define _PAGE_BIT_PAT 7 /* on 4KB pages */ +#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ +#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ +#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ +#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ +#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ +#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ + +/* If _PAGE_BIT_PRESENT is clear, we use these: */ +/* - if the user mapped it with PROT_NONE; pte_present gives true */ +#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL +/* - set: nonlinear file mapping, saved PTE; unset:swap */ +#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY + +#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) +#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) +#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) +#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) +#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) +#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) +#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) +#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) +#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) +#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) +#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) +#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) +#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) +#define __HAVE_ARCH_PTE_SPECIAL + +#ifdef CONFIG_KMEMCHECK +#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) +#else +#define _PAGE_HIDDEN (_AT(pteval_t, 0)) +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) +#else +#define _PAGE_NX (_AT(pteval_t, 0)) +#endif + +#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) + +#ifndef __ASSEMBLY__ +#if defined(CONFIG_X86_64) && 
CONFIG_XEN_COMPAT <= 0x030002 +extern unsigned int __kernel_page_user; +#else +#define __kernel_page_user 0 +#endif +#endif + +#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_DIRTY) +#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ + _PAGE_DIRTY | __kernel_page_user) + +/* Set of bits not changed in pte_modify */ +#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_CACHE_MASK | _PAGE_IOMAP | \ + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) + +/* + * PAT settings are part of the hypervisor interface, which sets the + * MSR to 0x050100070406 (i.e. WB, WT, UC-, UC, WC, WP [, UC, UC]). + */ +#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT | _PAGE_PAT) +#define _PAGE_CACHE_WB (0) +#define _PAGE_CACHE_WT (_PAGE_PWT) +#define _PAGE_CACHE_WC (_PAGE_PAT) +#define _PAGE_CACHE_WP (_PAGE_PAT | _PAGE_PWT) +#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD) +#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT) + +#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) +#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) + +#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ + _PAGE_USER | _PAGE_ACCESSED) +#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) +#define PAGE_COPY PAGE_COPY_NOEXEC +#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED | _PAGE_NX) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ + _PAGE_ACCESSED) + +#define __PAGE_KERNEL_EXEC \ + (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | __kernel_page_user) +#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) + +#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) +#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) +#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC) +#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) +#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) +#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) +#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) +#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) + +#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP) +#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP) + +#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) +#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) +#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) +#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) +#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC) +#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) +#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS) +#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE) +#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) +#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) +#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) +#define 
PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) +#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE) + +#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) +#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) +#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS) +#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC) + +/* xwr */ +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_EXEC +#define __P101 PAGE_READONLY_EXEC +#define __P110 PAGE_COPY_EXEC +#define __P111 PAGE_COPY_EXEC + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_EXEC +#define __S101 PAGE_READONLY_EXEC +#define __S110 PAGE_SHARED_EXEC +#define __S111 PAGE_SHARED_EXEC + +/* + * early identity mapping pte attrib macros. + */ +#ifdef CONFIG_X86_64 +#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC +#else +/* + * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection + * bits are combined, this will alow user to access the high address mapped + * VDSO in the presence of CONFIG_COMPAT_VDSO + */ +#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ +#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ +#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ +#endif + +#ifdef CONFIG_X86_32 +# include +#else +# include "pgtable_64_types.h" +#endif + +#ifndef __ASSEMBLY__ + +#include + +/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ +#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) + +/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ +#define PTE_FLAGS_MASK (~PTE_PFN_MASK) + +typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; + +#include + +typedef struct { pgdval_t pgd; } pgd_t; + +#define __pgd_ma(x) ((pgd_t) { (x) } ) +static inline pgd_t xen_make_pgd(pgdval_t val) +{ + if (likely(val & _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pgd_t) { val }; +} + +#define __pgd_val(x) ((x).pgd) +static inline pgdval_t xen_pgd_val(pgd_t pgd) +{ + pgdval_t ret = __pgd_val(pgd); +#if PAGETABLE_LEVELS == 2 && CONFIG_XEN_COMPAT <= 0x030002 + if (likely(ret)) + ret = machine_to_phys(ret) | _PAGE_PRESENT; +#else + if (likely(ret & _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); +#endif + return ret; +} + +static inline pgdval_t pgd_flags(pgd_t pgd) +{ + return __pgd_val(pgd) & PTE_FLAGS_MASK; +} + +#if PAGETABLE_LEVELS > 3 +typedef struct { pudval_t pud; } pud_t; + +#define __pud_ma(x) ((pud_t) { (x) } ) +static inline pud_t xen_make_pud(pudval_t val) +{ + if (likely(val & _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pud_t) { val }; +} + +#define __pud_val(x) ((x).pud) +static inline pudval_t xen_pud_val(pud_t pud) +{ + pudval_t ret = __pud_val(pud); + if (likely(ret & _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); + return ret; +} +#else +#include + +#define __pud_val(x) __pgd_val((x).pgd) +static inline pudval_t xen_pud_val(pud_t pud) +{ + return xen_pgd_val(pud.pgd); +} +#endif + +#if PAGETABLE_LEVELS > 2 +typedef struct { pmdval_t pmd; } pmd_t; + +#define __pmd_ma(x) ((pmd_t) { (x) } ) +static inline pmd_t xen_make_pmd(pmdval_t val) +{ + if (likely(val & _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pmd_t) { val }; +} + +#define __pmd_val(x) ((x).pmd) +static inline pmdval_t xen_pmd_val(pmd_t pmd) +{ + pmdval_t ret = 
__pmd_val(pmd); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (likely(ret)) + ret = pte_machine_to_phys(ret) | _PAGE_PRESENT; +#else + if (likely(ret & _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); +#endif + return ret; +} +#else +#include + +#define __pmd_ma(x) ((pmd_t) { .pud.pgd = __pgd_ma(x) } ) +#define __pmd_val(x) __pgd_val((x).pud.pgd) +static inline pmdval_t xen_pmd_val(pmd_t pmd) +{ + return xen_pgd_val(pmd.pud.pgd); +} +#endif + +static inline pudval_t pud_flags(pud_t pud) +{ + return __pud_val(pud) & PTE_FLAGS_MASK; +} + +static inline pmdval_t pmd_flags(pmd_t pmd) +{ + return __pmd_val(pmd) & PTE_FLAGS_MASK; +} + +#define __pte_ma(x) ((pte_t) { .pte = (x) } ) +static inline pte_t xen_make_pte(pteval_t val) +{ + if (likely((val & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT)) + val = pte_phys_to_machine(val); + return (pte_t) { .pte = val }; +} + +#define __pte_val(x) ((x).pte) +static inline pteval_t xen_pte_val(pte_t pte) +{ + pteval_t ret = __pte_val(pte); + if (likely((pte.pte_low & (_PAGE_PRESENT|_PAGE_IOMAP)) == _PAGE_PRESENT)) + ret = pte_machine_to_phys(ret); + return ret; +} + +static inline pteval_t pte_flags(pte_t pte) +{ + return __pte_val(pte) & PTE_FLAGS_MASK; +} + +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) } ) + + +typedef struct page *pgtable_t; + +extern pteval_t __supported_pte_mask; +extern void set_nx(void); +extern int nx_enabled; + +#define pgprot_writecombine pgprot_writecombine +extern pgprot_t pgprot_writecombine(pgprot_t prot); + +#ifndef CONFIG_XEN +/* Indicate that x86 has its own track and untrack pfn vma functions */ +#define __HAVE_PFNMAP_TRACKING +#endif + +#define __HAVE_PHYS_MEM_ACCESS_PROT +struct file; +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot); +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot); + +/* Install a pte for a particular vaddr in kernel space. */ +void set_pte_vaddr(unsigned long vaddr, pte_t pte); + +extern void xen_pagetable_reserve(u64 start, u64 end); + +struct seq_file; +extern void arch_report_meminfo(struct seq_file *m); + +enum { + PG_LEVEL_NONE, + PG_LEVEL_4K, + PG_LEVEL_2M, + PG_LEVEL_1G, + PG_LEVEL_NUM +}; + +#ifdef CONFIG_PROC_FS +extern void update_page_count(int level, unsigned long pages); +#else +static inline void update_page_count(int level, unsigned long pages) { } +#endif + +/* + * Helper function that returns the kernel pagetable entry controlling + * the virtual address 'address'. NULL means no pagetable entry present. + * NOTE: the return type is pte_t but if the pmd is PSE then we return it + * as a pte too. 
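The pte accessors above are where the pseudo-physical to machine translation happens: xen_make_pte() converts present, non-_PAGE_IOMAP values into machine format and xen_pte_val() converts them back, while _PAGE_IOMAP entries pass through untouched. Assuming pte_phys_to_machine() and pte_machine_to_phys() from maddr.h are exact inverses (which is their intent), the round trip below is a no-op; illustrative only, not part of the patch.

static inline int example_pte_roundtrip_ok(pteval_t val)
{
        /* the same pseudo-physical value should come back out */
        return pte_val(__pte(val)) == val;
}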
+ */ +extern pte_t *lookup_address(unsigned long address, unsigned int *level); + +#endif /* !__ASSEMBLY__ */ + +#endif /* _ASM_X86_PGTABLE_DEFS_H */ diff --git a/arch/x86/include/mach-xen/asm/probe_roms.h b/arch/x86/include/mach-xen/asm/probe_roms.h new file mode 100644 index 0000000..da90d01 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/probe_roms.h @@ -0,0 +1,10 @@ +#if !defined(CONFIG_XEN_UNPRIVILEGED_GUEST) +# include_next +#elif !defined(_PROBE_ROMS_H_) +# define _PROBE_ROMS_H_ +struct pci_dev; + +static inline void __iomem *pci_map_biosrom(struct pci_dev *pdev) { return NULL; } +static inline void pci_unmap_biosrom(void __iomem *rom) { } +static inline size_t pci_biosrom_size(struct pci_dev *pdev) { return 0; } +#endif diff --git a/arch/x86/include/mach-xen/asm/processor.h b/arch/x86/include/mach-xen/asm/processor.h new file mode 100644 index 0000000..242e146 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/processor.h @@ -0,0 +1,978 @@ +#ifndef _ASM_X86_PROCESSOR_H +#define _ASM_X86_PROCESSOR_H + +#include + +/* Forward declaration, a strange C thing */ +struct task_struct; +struct mm_struct; + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define HBP_NUM 4 +/* + * Default implementation of macro that returns current + * instruction pointer ("program counter"). + */ +static inline void *current_text_addr(void) +{ + void *pc; + + asm volatile("mov $1f, %0; 1:":"=r" (pc)); + + return pc; +} + +#ifdef CONFIG_X86_VSMP +# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) +# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) +#else +# define ARCH_MIN_TASKALIGN 16 +# define ARCH_MIN_MMSTRUCT_ALIGN 0 +#endif + +/* + * CPU type and hardware bug flags. Kept separately for each CPU. + * Members of this structure are referenced in head.S, so think twice + * before touching them. 
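An aside on lookup_address(), declared near the top of this hunk: it returns the kernel pte (or a PSE pmd cast to a pte) together with the mapping level, so callers must check the level before trusting the entry; the arbitrary_virt_to_mfn() macro earlier in this patch does exactly that. A minimal sketch, not part of the patch, with a hypothetical helper name:

static inline int example_vaddr_mapped_4k(unsigned long address)
{
        unsigned int level;
        pte_t *ptep = lookup_address(address, &level);

        return ptep && level == PG_LEVEL_4K && pte_present(*ptep);
}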
[mj] + */ + +struct cpuinfo_x86 { + __u8 x86; /* CPU family */ + __u8 x86_vendor; /* CPU vendor */ + __u8 x86_model; + __u8 x86_mask; +#ifdef CONFIG_X86_32 + char wp_works_ok; /* It doesn't on 386's */ + + /* Problems on some 486Dx4's and old 386's: */ +#ifndef CONFIG_XEN + char hlt_works_ok; +#endif + char hard_math; +#ifndef CONFIG_XEN + char rfu; + char fdiv_bug; + char f00f_bug; + char coma_bug; + char pad0; +#endif +#else + /* Number of 4K pages in DTLB/ITLB combined(in pages): */ + int x86_tlbsize; +#endif + __u8 x86_virt_bits; + __u8 x86_phys_bits; +#ifndef CONFIG_XEN + /* CPUID returned core id bits: */ + __u8 x86_coreid_bits; +#endif + /* Max extended CPUID function supported: */ + __u32 extended_cpuid_level; + /* Maximum supported CPUID level, -1=no CPUID: */ + int cpuid_level; + __u32 x86_capability[NCAPINTS]; + char x86_vendor_id[16]; + char x86_model_id[64]; + /* in KB - valid for CPUS which support this call: */ + int x86_cache_size; + int x86_cache_alignment; /* In bytes */ + int x86_power; + unsigned long loops_per_jiffy; +#ifndef CONFIG_XEN + /* cpuid returned max cores value: */ + u16 x86_max_cores; + u16 apicid; + u16 initial_apicid; +#endif + u16 x86_clflush_size; +#ifndef CONFIG_XEN + /* number of cores as seen by the OS: */ + u16 booted_cores; + /* Physical processor id: */ + u16 phys_proc_id; + /* Core id: */ + u16 cpu_core_id; + /* Compute unit id */ + u8 compute_unit_id; +#endif + /* Index into per_cpu list: */ + u16 cpu_index; +#ifndef CONFIG_XEN + u32 microcode; +#endif +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +#define X86_VENDOR_INTEL 0 +#define X86_VENDOR_CYRIX 1 +#define X86_VENDOR_AMD 2 +#define X86_VENDOR_UMC 3 +#define X86_VENDOR_CENTAUR 5 +#define X86_VENDOR_TRANSMETA 7 +#define X86_VENDOR_NSC 8 +#define X86_VENDOR_NUM 9 + +#define X86_VENDOR_UNKNOWN 0xff + +/* + * capabilities of CPUs + */ +extern struct cpuinfo_x86 boot_cpu_data; +extern struct cpuinfo_x86 new_cpu_data; + +extern __u32 cpu_caps_cleared[NCAPINTS]; +extern __u32 cpu_caps_set[NCAPINTS]; + +#ifdef CONFIG_SMP +DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +#define cpu_data(cpu) per_cpu(cpu_info, cpu) +#else +#define cpu_info boot_cpu_data +#define cpu_data(cpu) boot_cpu_data +#endif + +extern const struct seq_operations cpuinfo_op; + +static inline int hlt_works(int cpu) +{ +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) + return cpu_data(cpu).hlt_works_ok; +#else + return 1; +#endif +} + +#define cache_line_size() (boot_cpu_data.x86_cache_alignment) + +extern void cpu_detect(struct cpuinfo_x86 *c); + +extern struct pt_regs *idle_regs(struct pt_regs *); + +extern void early_cpu_init(void); +extern void identify_boot_cpu(void); +extern void identify_secondary_cpu(struct cpuinfo_x86 *); +extern void print_cpu_info(struct cpuinfo_x86 *); +extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); +extern unsigned short num_cache_leaves; + +extern void detect_extended_topology(struct cpuinfo_x86 *c); +extern void detect_ht(struct cpuinfo_x86 *c); + +static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. 
*/ + asm volatile(XEN_CPUID + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx) + : "memory"); +} + +static inline void load_cr3(pgd_t *pgdir) +{ + write_cr3(__pa(pgdir)); +} + +#ifndef CONFIG_X86_NO_TSS +#ifdef CONFIG_X86_32 +/* This is the TSS defined by the hardware. */ +struct x86_hw_tss { + unsigned short back_link, __blh; + unsigned long sp0; + unsigned short ss0, __ss0h; + unsigned long sp1; + /* ss1 caches MSR_IA32_SYSENTER_CS: */ + unsigned short ss1, __ss1h; + unsigned long sp2; + unsigned short ss2, __ss2h; + unsigned long __cr3; + unsigned long ip; + unsigned long flags; + unsigned long ax; + unsigned long cx; + unsigned long dx; + unsigned long bx; + unsigned long sp; + unsigned long bp; + unsigned long si; + unsigned long di; + unsigned short es, __esh; + unsigned short cs, __csh; + unsigned short ss, __ssh; + unsigned short ds, __dsh; + unsigned short fs, __fsh; + unsigned short gs, __gsh; + unsigned short ldt, __ldth; + unsigned short trace; + unsigned short io_bitmap_base; + +} __attribute__((packed)); +extern struct tss_struct doublefault_tss; +#else +struct x86_hw_tss { + u32 reserved1; + u64 sp0; + u64 sp1; + u64 sp2; + u64 reserved2; + u64 ist[7]; + u32 reserved3; + u32 reserved4; + u16 reserved5; + u16 io_bitmap_base; + +} __attribute__((packed)) ____cacheline_aligned; +#endif +#endif /* CONFIG_X86_NO_TSS */ + +/* + * IO-bitmap sizes: + */ +#define IO_BITMAP_BITS 65536 +#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) +#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) +#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) +#define INVALID_IO_BITMAP_OFFSET 0x8000 + +#ifndef CONFIG_X86_NO_TSS +struct tss_struct { + /* + * The hardware state: + */ + struct x86_hw_tss x86_tss; + + /* + * The extra 1 is there because the CPU will access an + * additional byte beyond the end of the IO permission + * bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ + unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; + + /* + * .. 
and then another 0x100 bytes for the emergency kernel stack: + */ + unsigned long stack[64]; + +} ____cacheline_aligned; + +DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); + +/* + * Save the original ist values for checking stack pointers during debugging + */ +struct orig_ist { + unsigned long ist[7]; +}; +#endif /* CONFIG_X86_NO_TSS */ + +#define MXCSR_DEFAULT 0x1f80 + +struct i387_fsave_struct { + u32 cwd; /* FPU Control Word */ + u32 swd; /* FPU Status Word */ + u32 twd; /* FPU Tag Word */ + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Pointer Offset */ + u32 fos; /* FPU Operand Pointer Selector */ + + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + + /* Software status information [not touched by FSAVE ]: */ + u32 status; +}; + +struct i387_fxsave_struct { + u16 cwd; /* Control Word */ + u16 swd; /* Status Word */ + u16 twd; /* Tag Word */ + u16 fop; /* Last Instruction Opcode */ + union { + struct { + u64 rip; /* Instruction Pointer */ + u64 rdp; /* Data Pointer */ + }; + struct { + u32 fip; /* FPU IP Offset */ + u32 fcs; /* FPU IP Selector */ + u32 foo; /* FPU Operand Offset */ + u32 fos; /* FPU Operand Selector */ + }; + }; + u32 mxcsr; /* MXCSR Register State */ + u32 mxcsr_mask; /* MXCSR Mask */ + + /* 8*16 bytes for each FP-reg = 128 bytes: */ + u32 st_space[32]; + + /* 16*16 bytes for each XMM-reg = 256 bytes: */ + u32 xmm_space[64]; + + u32 padding[12]; + + union { + u32 padding1[12]; + u32 sw_reserved[12]; + }; + +} __attribute__((aligned(16))); + +struct i387_soft_struct { + u32 cwd; + u32 swd; + u32 twd; + u32 fip; + u32 fcs; + u32 foo; + u32 fos; + /* 8*10 bytes for each FP-reg = 80 bytes: */ + u32 st_space[20]; + u8 ftop; + u8 changed; + u8 lookahead; + u8 no_update; + u8 rm; + u8 alimit; + struct math_emu_info *info; + u32 entry_eip; +}; + +struct ymmh_struct { + /* 16 * 16 bytes for each YMMH-reg = 256 bytes */ + u32 ymmh_space[64]; +}; + +struct xsave_hdr_struct { + u64 xstate_bv; + u64 reserved1[2]; + u64 reserved2[5]; +} __attribute__((packed)); + +struct xsave_struct { + struct i387_fxsave_struct i387; + struct xsave_hdr_struct xsave_hdr; + struct ymmh_struct ymmh; + /* new processor state extensions will go here */ +} __attribute__ ((packed, aligned (64))); + +union thread_xstate { + struct i387_fsave_struct fsave; + struct i387_fxsave_struct fxsave; + struct i387_soft_struct soft; + struct xsave_struct xsave; +}; + +struct fpu { + unsigned int last_cpu; + unsigned int has_fpu; + union thread_xstate *state; +}; + +#ifdef CONFIG_X86_64 +#ifndef CONFIG_X86_NO_TSS +DECLARE_PER_CPU(struct orig_ist, orig_ist); +#endif + +union irq_stack_union { + char irq_stack[IRQ_STACK_SIZE]; + /* + * GCC hardcodes the stack canary as %gs:40. Since the + * irq_stack is the object at %gs:0, we reserve the bottom + * 48 bytes of the irq stack for the canary. + */ + struct { + char gs_base[40]; + unsigned long stack_canary; + }; +}; + +DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union); +DECLARE_INIT_PER_CPU(irq_stack_union); + +DECLARE_PER_CPU(char *, irq_stack_ptr); +DECLARE_PER_CPU(unsigned int, irq_count); +extern unsigned long kernel_eflags; +extern asmlinkage void ignore_sysret(void); +#else /* X86_64 */ +#ifdef CONFIG_CC_STACKPROTECTOR +/* + * Make sure stack canary segment base is cached-aligned: + * "For Intel Atom processors, avoid non zero segment base address + * that is not aligned to cache line boundary at all cost." + * (Optim Ref Manual Assembly/Compiler Coding Rule 15.) 
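On the 64-bit side above, irq_stack_union reserves its first 40 bytes (gs_base) because gcc hardcodes the stack-protector canary at %gs:40. Something like the following compile-time check keeps that layout honest; this is a sketch, not necessarily how this tree spells it, and the function name is hypothetical.

static inline void example_check_canary_layout(void)
{
        /* the canary must sit right after the 40-byte gs_base pad */
        BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40);
}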
+ */ +struct stack_canary { + char __pad[20]; /* canary at %gs:20 */ + unsigned long canary; +}; +DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); +#endif +#endif /* X86_64 */ + +extern unsigned int xstate_size; +extern void free_thread_xstate(struct task_struct *); +extern struct kmem_cache *task_xstate_cachep; + +struct perf_event; + +struct thread_struct { + /* Cached TLS descriptors: */ + struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + unsigned long sp0; + unsigned long sp; +#ifdef CONFIG_X86_32 + unsigned long sysenter_cs; +#else + unsigned short es; + unsigned short ds; + unsigned short fsindex; + unsigned short gsindex; +#endif +#ifdef CONFIG_X86_32 + unsigned long ip; +#endif +#ifdef CONFIG_X86_64 + unsigned long fs; +#endif + unsigned long gs; + /* Save middle states of ptrace breakpoints */ + struct perf_event *ptrace_bps[HBP_NUM]; + /* Debug status used for traps, single steps, etc... */ + unsigned long debugreg6; + /* Keep track of the exact dr7 value set by the user */ + unsigned long ptrace_dr7; + /* Fault info: */ + unsigned long cr2; + unsigned long trap_no; + unsigned long error_code; + /* floating point and extended processor state */ + struct fpu fpu; +#ifdef CONFIG_X86_32 + /* Virtual 86 mode info */ + struct vm86_struct __user *vm86_info; + unsigned long screen_bitmap; + unsigned long v86flags, v86mask, saved_sp0; + unsigned int saved_fs, saved_gs; +#endif + /* IO permissions: */ + unsigned long *io_bitmap_ptr; + unsigned long iopl; + /* Max allowed port in the bitmap, in bytes: */ + unsigned io_bitmap_max; +}; + +static inline unsigned long xen_get_debugreg(int regno) +{ + return HYPERVISOR_get_debugreg(regno); +} + +static inline void xen_set_debugreg(int regno, unsigned long value) +{ + WARN_ON(HYPERVISOR_set_debugreg(regno, value)); +} + +/* + * Set IOPL bits in EFLAGS from given mask + */ +static inline void xen_set_iopl_mask(unsigned mask) +{ + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl)); +} + +#ifndef CONFIG_X86_NO_TSS +static inline void +native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) +{ + tss->x86_tss.sp0 = thread->sp0; +#ifdef CONFIG_X86_32 + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ + if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { + tss->x86_tss.ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } +#endif +} +#else +#define xen_load_sp0(tss, thread) do { \ + if (HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->sp0)) \ + BUG(); \ +} while (0) +#endif + +#define __cpuid xen_cpuid +#define paravirt_enabled() 1 + +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = xen_get_debugreg(register) +#define set_debugreg(value, register) \ + xen_set_debugreg(register, value) + +#define load_sp0 xen_load_sp0 + +#define set_iopl_mask xen_set_iopl_mask + +/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up + * after us can get the correct flags. 
+ */ +extern unsigned long mmu_cr4_features; + +static inline void set_in_cr4(unsigned long mask) +{ + unsigned long cr4; + + mmu_cr4_features |= mask; + cr4 = read_cr4(); + cr4 |= mask; + write_cr4(cr4); +} + +static inline void clear_in_cr4(unsigned long mask) +{ + unsigned long cr4; + + mmu_cr4_features &= ~mask; + cr4 = read_cr4(); + cr4 &= ~mask; + write_cr4(cr4); +} + +typedef struct { + unsigned long seg; +} mm_segment_t; + + +/* + * create a kernel thread without removing it from tasklists + */ +extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); + +/* Free all resources held by a thread. */ +extern void release_thread(struct task_struct *); + +/* Prepare to copy thread state - unlazy all lazy state */ +extern void prepare_to_copy(struct task_struct *tsk); + +unsigned long get_wchan(struct task_struct *p); + +/* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx + * resulting in stale register contents being returned. + */ +static inline void cpuid(unsigned int op, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = 0; + __cpuid(eax, ebx, ecx, edx); +} + +/* Some CPUID calls want 'count' to be placed in ecx */ +static inline void cpuid_count(unsigned int op, int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +/* + * CPUID functions returning a single datum + */ +static inline unsigned int cpuid_eax(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return eax; +} + +static inline unsigned int cpuid_ebx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ebx; +} + +static inline unsigned int cpuid_ecx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return ecx; +} + +static inline unsigned int cpuid_edx(unsigned int op) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid(op, &eax, &ebx, &ecx, &edx); + + return edx; +} + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static inline void rep_nop(void) +{ + asm volatile("rep; nop" ::: "memory"); +} + +static inline void cpu_relax(void) +{ + rep_nop(); +} + +/* Stop speculative execution and prefetching of modified code. */ +static inline void sync_core(void) +{ + int tmp; + +#if defined(CONFIG_M386) || defined(CONFIG_M486) + if (boot_cpu_data.x86 < 5) + /* There is no speculative execution. + * jmp is a barrier to prefetching. */ + asm volatile("jmp 1f\n1:\n" ::: "memory"); + else +#endif + /* cpuid is a barrier to speculative execution. + * Prefetched instructions are automatically + * invalidated when modified. 
*/ + asm volatile("cpuid" : "=a" (tmp) : "0" (1) + : "ebx", "ecx", "edx", "memory"); +} + +static inline void __monitor(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitor %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + +static inline void __mwait(unsigned long eax, unsigned long ecx) +{ + /* "mwait %eax, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +static inline void __sti_mwait(unsigned long eax, unsigned long ecx) +{ + trace_hardirqs_on(); + /* "mwait %eax, %ecx;" */ + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); +} + +extern void select_idle_routine(const struct cpuinfo_x86 *c); +extern void init_amd_e400_c1e_mask(void); + +extern unsigned long boot_option_idle_override; +extern bool amd_e400_c1e_detected; + +enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, + IDLE_POLL, IDLE_FORCE_MWAIT}; + +extern void enable_sep_cpu(void); +extern int sysenter_setup(void); + +extern void early_trap_init(void); + +/* Defined in head.S */ +extern struct desc_ptr early_gdt_descr; + +extern void cpu_set_gdt(int); +extern void switch_to_new_gdt(int); +extern void load_percpu_segment(int); +extern void cpu_init(void); + +static inline unsigned long get_debugctlmsr(void) +{ + unsigned long debugctlmsr = 0; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); + + return debugctlmsr; +} + +static inline void update_debugctlmsr(unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); +} + +/* + * from system description table in BIOS. Mostly for MCA use, but + * others may find it useful: + */ +extern unsigned int machine_id; +extern unsigned int machine_submodel_id; +extern unsigned int BIOS_revision; + +/* Boot loader type from the setup header: */ +extern int bootloader_type; +extern int bootloader_version; + +extern char ignore_fpu_irq; + +#define HAVE_ARCH_PICK_MMAP_LAYOUT 1 +#define ARCH_HAS_PREFETCHW +#define ARCH_HAS_SPINLOCK_PREFETCH + +#ifdef CONFIG_X86_32 +# define BASE_PREFETCH ASM_NOP4 +# define ARCH_HAS_PREFETCH +#else +# define BASE_PREFETCH "prefetcht0 (%1)" +#endif + +/* + * Prefetch instructions for Pentium III (+) and AMD Athlon (+) + * + * It's not worth to care about 3dnow prefetches for the K6 + * because they are microcoded there and very slow. + */ +static inline void prefetch(const void *x) +{ + alternative_input(BASE_PREFETCH, + "prefetchnta (%1)", + X86_FEATURE_XMM, + "r" (x)); +} + +/* + * 3dnow prefetch to get an exclusive cache line. + * Useful for spinlocks to avoid one state transition in the + * cache coherency protocol: + */ +static inline void prefetchw(const void *x) +{ + alternative_input(BASE_PREFETCH, + "prefetchw (%1)", + X86_FEATURE_3DNOW, + "r" (x)); +} + +static inline void spin_lock_prefetch(const void *x) +{ + prefetchw(x); +} + +#ifdef CONFIG_X86_32 +/* + * User space process size: 3GB (default). + */ +#define TASK_SIZE PAGE_OFFSET +#define TASK_SIZE_MAX TASK_SIZE +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX STACK_TOP + +#define INIT_THREAD { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .vm86_info = NULL, \ + .sysenter_cs = __KERNEL_CS, \ + .io_bitmap_ptr = NULL, \ +} + +/* + * Note that the .io_bitmap member must be extra-big. 
This is because + * the CPU will access an additional byte beyond the end of the IO + * permission bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ +#define INIT_TSS { \ + .x86_tss = { \ + .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .ss0 = __KERNEL_DS, \ + .ss1 = __KERNEL_CS, \ + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ + }, \ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ +} + +extern unsigned long thread_saved_pc(struct task_struct *tsk); + +#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) +#define KSTK_TOP(info) \ +({ \ + unsigned long *__ptr = (unsigned long *)(info); \ + (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ +}) + +/* + * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * This is necessary to guarantee that the entire "struct pt_regs" + * is accessible even if the CPU haven't stored the SS/ESP registers + * on the stack (interrupt gate does not save these registers + * when switching to the same priv ring). + * Therefore beware: accessing the ss/esp fields of the + * "struct pt_regs" is possible, but they may contain the + * completely wrong values. + */ +#define task_pt_regs(task) \ +({ \ + struct pt_regs *__regs__; \ + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ + __regs__ - 1; \ +}) + +#else +/* + * User space process size. 47bits minus one guard page. + */ +#define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) + +/* This decides where the kernel will search for a free chunk of vm + * space during mmap's. + */ +#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ + 0xc0000000 : 0xFFFFe000) + +#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) +#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \ + IA32_PAGE_OFFSET : TASK_SIZE_MAX) + +#define STACK_TOP TASK_SIZE +#define STACK_TOP_MAX TASK_SIZE_MAX + +#define INIT_THREAD { \ + .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +#define INIT_TSS { \ + .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ +} + +/* + * Return saved PC of a blocked thread. + * What is this good for? it will be always the scheduler or ret_from_fork. + */ +#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) + +#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) +#endif /* CONFIG_X86_64 */ + +extern void start_thread(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp); + +/* + * This decides where the kernel will search for a free chunk of vm + * space during mmap's. 
+ */ +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) + +#define KSTK_EIP(task) (task_pt_regs(task)->ip) +#define KSTK_ESP(task) (task_pt_regs(task)->sp) + +/* Get/set a process' ability to use the timestamp counter instruction */ +#define GET_TSC_CTL(adr) get_tsc_mode((adr)) +#define SET_TSC_CTL(val) set_tsc_mode((val)) + +extern int get_tsc_mode(unsigned long adr); +extern int set_tsc_mode(unsigned int val); + +extern int amd_get_nb_id(int cpu); + +struct aperfmperf { + u64 aperf, mperf; +}; + +static inline void get_aperfmperf(struct aperfmperf *am) +{ + WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_APERFMPERF)); + + rdmsrl(MSR_IA32_APERF, am->aperf); + rdmsrl(MSR_IA32_MPERF, am->mperf); +} + +#define APERFMPERF_SHIFT 10 + +static inline +unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, + struct aperfmperf *new) +{ + u64 aperf = new->aperf - old->aperf; + u64 mperf = new->mperf - old->mperf; + unsigned long ratio = aperf; + + mperf >>= APERFMPERF_SHIFT; + if (mperf) + ratio = div64_u64(aperf, mperf); + + return ratio; +} + +/* + * AMD errata checking + */ +#ifdef CONFIG_CPU_SUP_AMD +extern const int amd_erratum_383[]; +extern const int amd_erratum_400[]; +extern bool cpu_has_amd_erratum(const int *); + +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +#else +#define cpu_has_amd_erratum(x) (false) +#endif /* CONFIG_CPU_SUP_AMD */ + +#endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/mach-xen/asm/setup.h b/arch/x86/include/mach-xen/asm/setup.h new file mode 100644 index 0000000..aaa418c --- /dev/null +++ b/arch/x86/include/mach-xen/asm/setup.h @@ -0,0 +1,21 @@ +#ifndef __ASSEMBLY__ + +void xen_start_kernel(void); +void xen_arch_setup(void); + +#ifdef CONFIG_X86_64 +void reserve_pfn_range(unsigned long pfn, unsigned long nr); +void reserve_pgtable_low(void); +#endif + +extern unsigned long xen_initrd_start; + +#ifdef CONFIG_EFI +void efi_probe(void); +#else +#define efi_probe() ((void)0) +#endif + +#endif + +#include_next diff --git a/arch/x86/include/mach-xen/asm/smp-processor-id.h b/arch/x86/include/mach-xen/asm/smp-processor-id.h new file mode 100644 index 0000000..c6c1ec5 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/smp-processor-id.h @@ -0,0 +1,36 @@ +#ifndef _ASM_X86_SMP_PROCESSOR_ID_H +#define _ASM_X86_SMP_PROCESSOR_ID_H + +#if defined(CONFIG_SMP) && !defined(__ASSEMBLY__) + +#include + +DECLARE_PER_CPU(int, cpu_number); + +/* + * This function is needed by all SMP systems. It must _always_ be valid + * from the initial startup. We map APIC_BASE very early in page_setup(), + * so this is correct in the x86 case. 
+ */ +#define raw_smp_processor_id() percpu_read(cpu_number) +#define safe_smp_processor_id() smp_processor_id() + +#ifdef CONFIG_X86_64_SMP +#define stack_smp_processor_id() \ +({ \ + struct thread_info *ti; \ + __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ + ti->cpu; \ +}) +#endif + +#ifdef CONFIG_DEBUG_PREEMPT +extern unsigned int debug_smp_processor_id(void); +# define smp_processor_id() debug_smp_processor_id() +#else +# define smp_processor_id() raw_smp_processor_id() +#endif + +#endif /* SMP && !__ASSEMBLY__ */ + +#endif /* _ASM_X86_SMP_PROCESSOR_ID_H */ diff --git a/arch/x86/include/mach-xen/asm/smp.h b/arch/x86/include/mach-xen/asm/smp.h new file mode 100644 index 0000000..76f78bf --- /dev/null +++ b/arch/x86/include/mach-xen/asm/smp.h @@ -0,0 +1,241 @@ +#ifndef _ASM_X86_SMP_H +#define _ASM_X86_SMP_H +#ifndef __ASSEMBLY__ +#include +#include +#include + +/* + * We need the APIC definitions automatically as part of 'smp.h' + */ +#ifdef CONFIG_X86_LOCAL_APIC +# include +# include +# ifdef CONFIG_X86_IO_APIC +# include +# endif +#endif +#include +#include +#include + +extern unsigned int num_processors; + +#ifndef CONFIG_XEN +static inline bool cpu_has_ht_siblings(void) +{ + bool has_siblings = false; +#ifdef CONFIG_SMP + has_siblings = cpu_has_ht && smp_num_siblings > 1; +#endif + return has_siblings; +} + +DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); +/* cpus sharing the last level cache: */ +DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); +DECLARE_PER_CPU(u16, cpu_llc_id); +DECLARE_PER_CPU(int, cpu_number); +#endif + +static inline const struct cpumask *cpu_sibling_mask(int cpu) +{ + return cpumask_of(cpu); +} + +static inline const struct cpumask *cpu_core_mask(int cpu) +{ + return cpumask_of(cpu); +} + +#ifndef CONFIG_XEN +static inline struct cpumask *cpu_llc_shared_mask(int cpu) +{ + return per_cpu(cpu_llc_shared_map, cpu); +} + +DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); +DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) +DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid); +#endif +#endif + +#ifdef CONFIG_SMP + +#ifndef CONFIG_XEN + +/* Static state in head.S used to set up a CPU */ +extern unsigned long stack_start; /* Initial stack pointer address */ + +struct smp_ops { + void (*smp_prepare_boot_cpu)(void); + void (*smp_prepare_cpus)(unsigned max_cpus); + void (*smp_cpus_done)(unsigned max_cpus); + + void (*stop_other_cpus)(int wait); + void (*smp_send_reschedule)(int cpu); + + int (*cpu_up)(unsigned cpu); + int (*cpu_disable)(void); + void (*cpu_die)(unsigned int cpu); + void (*play_dead)(void); + + void (*send_call_func_ipi)(const struct cpumask *mask); + void (*send_call_func_single_ipi)(int cpu); +}; + +/* Globals due to paravirt */ +extern void set_cpu_sibling_map(int cpu); + +extern struct smp_ops smp_ops; + +static inline void smp_send_stop(void) +{ + smp_ops.stop_other_cpus(0); +} + +static inline void stop_other_cpus(void) +{ + smp_ops.stop_other_cpus(1); +} + +static inline void smp_prepare_boot_cpu(void) +{ + smp_ops.smp_prepare_boot_cpu(); +} + +static inline void smp_prepare_cpus(unsigned int max_cpus) +{ + smp_ops.smp_prepare_cpus(max_cpus); +} + +static inline void smp_cpus_done(unsigned int max_cpus) +{ + smp_ops.smp_cpus_done(max_cpus); +} + +static inline int __cpu_up(unsigned int cpu) +{ + return smp_ops.cpu_up(cpu); +} + +static inline int __cpu_disable(void) +{ + return smp_ops.cpu_disable(); +} + +static inline void 
__cpu_die(unsigned int cpu) +{ + smp_ops.cpu_die(cpu); +} + +static inline void play_dead(void) +{ + smp_ops.play_dead(); +} + +static inline void smp_send_reschedule(int cpu) +{ + smp_ops.smp_send_reschedule(cpu); +} + +static inline void arch_send_call_function_single_ipi(int cpu) +{ + smp_ops.send_call_func_single_ipi(cpu); +} + +static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) +{ + smp_ops.send_call_func_ipi(mask); +} + +void cpu_disable_common(void); +void native_smp_prepare_boot_cpu(void); +void native_smp_prepare_cpus(unsigned int max_cpus); +void native_smp_cpus_done(unsigned int max_cpus); +int native_cpu_up(unsigned int cpunum); +int native_cpu_disable(void); +void native_cpu_die(unsigned int cpu); +void native_play_dead(void); +void play_dead_common(void); +void wbinvd_on_cpu(int cpu); +int wbinvd_on_all_cpus(void); + +void smp_store_cpu_info(int id); +#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) + +#else /* CONFIG_XEN */ + +extern int __cpu_disable(void); +extern void __cpu_die(unsigned int cpu); +void xen_stop_other_cpus(int wait); +void xen_smp_send_reschedule(int cpu); +void xen_send_call_func_ipi(const struct cpumask *mask); +void xen_send_call_func_single_ipi(int cpu); + +static inline void smp_send_stop(void) +{ + xen_stop_other_cpus(0); +} + +#define smp_send_reschedule xen_smp_send_reschedule +#define arch_send_call_function_single_ipi xen_send_call_func_single_ipi +#define arch_send_call_function_ipi_mask xen_send_call_func_ipi + +void play_dead(void); + +#endif /* CONFIG_XEN */ + +/* We don't mark CPUs online until __cpu_up(), so we need another measure */ +static inline int num_booting_cpus(void) +{ + return cpumask_weight(cpu_callout_mask); +} +#elif /* !CONFIG_SMP && */ !defined(CONFIG_XEN) +#define wbinvd_on_cpu(cpu) wbinvd() +static inline int wbinvd_on_all_cpus(void) +{ + wbinvd(); + return 0; +} +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_XEN +int wbinvd_on_all_cpus(void); +#endif + +extern unsigned disabled_cpus __cpuinitdata; + +#include + +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) + +#ifndef CONFIG_X86_64 +static inline int logical_smp_processor_id(void) +{ + /* we don't want to mark this access volatile - bad code generation */ + return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); +} + +#endif + +extern int hard_smp_processor_id(void); + +#else /* CONFIG_X86_LOCAL_APIC */ + +# ifndef CONFIG_SMP +# define hard_smp_processor_id() 0 +# endif + +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_DEBUG_NMI_SELFTEST +extern void nmi_selftest(void); +#else +#define nmi_selftest() do { } while (0) +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_SMP_H */ diff --git a/arch/x86/include/mach-xen/asm/spinlock.h b/arch/x86/include/mach-xen/asm/spinlock.h new file mode 100644 index 0000000..af304bb --- /dev/null +++ b/arch/x86/include/mach-xen/asm/spinlock.h @@ -0,0 +1,379 @@ +#ifndef _ASM_X86_SPINLOCK_H +#define _ASM_X86_SPINLOCK_H + +#include +#include +#include +#include + +/* + * Your basic SMP spinlocks, allowing only a single CPU anywhere + * + * Simple spin lock operations. There are two variants, one clears IRQ's + * on the local processor, one does not. + * + * These are fair FIFO ticket locks, which are currently limited to 256 + * CPUs. 
+ * + * (the type definitions are in asm/spinlock_types.h) + */ + +#ifdef CONFIG_X86_32 +# define LOCK_PTR_REG "a" +# define REG_PTR_MODE "k" +#else +# define LOCK_PTR_REG "D" +# define REG_PTR_MODE "q" +#endif + +#if defined(CONFIG_XEN) || (defined(CONFIG_X86_32) && \ + (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))) +/* + * On Xen, as we read back the result of the unlocking increment, we must use + * a locked access (or insert a full memory barrier) in all cases (so that we + * read what is globally visible). + * + * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock + * (PPro errata 66, 92) + */ +# define UNLOCK_LOCK_PREFIX LOCK_PREFIX +#else +# define UNLOCK_LOCK_PREFIX +#endif + +#ifdef TICKET_SHIFT + +#include +#include + +int xen_spinlock_init(unsigned int cpu); +void xen_spinlock_cleanup(unsigned int cpu); +#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING +struct __raw_tickets xen_spin_adjust(const arch_spinlock_t *, + struct __raw_tickets); +#else +#define xen_spin_adjust(lock, raw_tickets) (raw_tickets) +#define xen_spin_wait(l, t, f) xen_spin_wait(l, t) +#endif +unsigned int xen_spin_wait(arch_spinlock_t *, struct __raw_tickets *, + unsigned int flags); +void xen_spin_kick(const arch_spinlock_t *, unsigned int ticket); + +/* + * Ticket locks are conceptually two parts, one indicating the current head of + * the queue, and the other indicating the current tail. The lock is acquired + * by atomically noting the tail and incrementing it by one (thus adding + * ourself to the queue and noting our position), then waiting until the head + * becomes equal to the the initial value of the tail. + * + * We use an xadd covering *both* parts of the lock, to increment the tail and + * also load the position of the head, which takes care of memory ordering + * issues and should be optimal for the uncontended case. Note the tail must be + * in the high part, because a wide xadd increment of the low part would carry + * up and contaminate the high part. + */ +#define __spin_count_dec(c, l) (vcpu_running((l)->owner) ? 
--(c) : ((c) >>= 1)) + +#if CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING +static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) +{ + struct __raw_tickets inc = { .tail = 1 }; + unsigned int count, flags = arch_local_irq_save(); + + inc = xadd(&lock->tickets, inc); + if (likely(inc.head == inc.tail)) + arch_local_irq_restore(flags); + else { + inc = xen_spin_adjust(lock, inc); + arch_local_irq_restore(flags); + count = 1 << 12; + do { + while (inc.head != inc.tail + && __spin_count_dec(count, lock)) { + cpu_relax(); + inc.head = ACCESS_ONCE(lock->tickets.head); + } + } while (unlikely(!count) + && (count = xen_spin_wait(lock, &inc, flags))); + } + barrier(); /* make sure nothing creeps before the lock is taken */ + lock->owner = raw_smp_processor_id(); +} +#else +#define __ticket_spin_lock(lock) __ticket_spin_lock_flags(lock, -1) +#endif + +static __always_inline void __ticket_spin_lock_flags(arch_spinlock_t *lock, + unsigned long flags) +{ + struct __raw_tickets inc = { .tail = 1 }; + + inc = xadd(&lock->tickets, inc); + if (unlikely(inc.head != inc.tail)) { + unsigned int count = 1 << 12; + + inc = xen_spin_adjust(lock, inc); + do { + while (inc.head != inc.tail + && __spin_count_dec(count, lock)) { + cpu_relax(); + inc.head = ACCESS_ONCE(lock->tickets.head); + } + } while (unlikely(!count) + && (count = xen_spin_wait(lock, &inc, flags))); + } + barrier(); /* make sure nothing creeps before the lock is taken */ + lock->owner = raw_smp_processor_id(); +} + +#undef __spin_count_dec + +static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) +{ + arch_spinlock_t old; + + old.tickets = ACCESS_ONCE(lock->tickets); + if (old.tickets.head != old.tickets.tail) + return 0; + + /* cmpxchg is a full barrier, so nothing can move before it */ + if (cmpxchg(&lock->head_tail, old.head_tail, + old.head_tail + (1 << TICKET_SHIFT)) != old.head_tail) + return 0; + lock->owner = raw_smp_processor_id(); + return 1; +} + +static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) +{ + register struct __raw_tickets new; + + __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX); +#if !defined(XEN_SPINLOCK_SOURCE) || !CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING +# undef UNLOCK_LOCK_PREFIX +#endif + new = ACCESS_ONCE(lock->tickets); + if (new.head != new.tail) + xen_spin_kick(lock, new.head); +} + +static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) +{ + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + + return tmp.tail != tmp.head; +} + +static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) +{ + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + + return (__ticket_t)(tmp.tail - tmp.head) > 1; +} + +#define __arch_spin(n) __ticket_spin_##n + +#else /* TICKET_SHIFT */ + +static inline int xen_spinlock_init(unsigned int cpu) { return 0; } +static inline void xen_spinlock_cleanup(unsigned int cpu) {} + +static inline int __byte_spin_is_locked(arch_spinlock_t *lock) +{ + return lock->lock != 0; +} + +static inline int __byte_spin_is_contended(arch_spinlock_t *lock) +{ + return lock->spinners != 0; +} + +static inline void __byte_spin_lock(arch_spinlock_t *lock) +{ + s8 val = 1; + + asm("1: xchgb %1, %0\n" + " test %1,%1\n" + " jz 3f\n" + " " LOCK_PREFIX "incb %2\n" + "2: rep;nop\n" + " cmpb $1, %0\n" + " je 2b\n" + " " LOCK_PREFIX "decb %2\n" + " jmp 1b\n" + "3:" + : "+m" (lock->lock), "+q" (val), "+m" (lock->spinners): : "memory"); +} + +#define __byte_spin_lock_flags(lock, flags) __byte_spin_lock(lock) + +static inline int 
__byte_spin_trylock(arch_spinlock_t *lock) +{ + u8 old = 1; + + asm("xchgb %1,%0" + : "+m" (lock->lock), "+q" (old) : : "memory"); + + return old == 0; +} + +static inline void __byte_spin_unlock(arch_spinlock_t *lock) +{ + smp_wmb(); + lock->lock = 0; +} + +#define __arch_spin(n) __byte_spin_##n + +#endif /* TICKET_SHIFT */ + +#if defined(CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING) \ + && CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING +void xen_spin_irq_enter(void); +void xen_spin_irq_exit(void); +#else +static inline void xen_spin_irq_enter(void) {} +static inline void xen_spin_irq_exit(void) {} +#endif + +static inline int arch_spin_is_locked(arch_spinlock_t *lock) +{ + return __arch_spin(is_locked)(lock); +} + +static inline int arch_spin_is_contended(arch_spinlock_t *lock) +{ + return __arch_spin(is_contended)(lock); +} +#define arch_spin_is_contended arch_spin_is_contended + +static __always_inline void arch_spin_lock(arch_spinlock_t *lock) +{ + __arch_spin(lock)(lock); +} + +static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) +{ + return __arch_spin(trylock)(lock); +} + +static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) +{ + __arch_spin(unlock)(lock); +} + +static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, + unsigned long flags) +{ + __arch_spin(lock_flags)(lock, flags); +} + +#undef __arch_spin + +static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) +{ + while (arch_spin_is_locked(lock)) + cpu_relax(); +} + +/* + * Read-write spinlocks, allowing multiple readers + * but only one writer. + * + * NOTE! it is quite common to have readers in interrupts + * but no interrupt writers. For those circumstances we + * can "mix" irq-safe locks - any writer needs to get a + * irq-safe write-lock, but readers can get non-irqsafe + * read-locks. + * + * On x86, we implement read-write locks as a 32-bit counter + * with the high bit (sign) being the "contended" bit. + */ + +/** + * read_can_lock - would read_trylock() succeed? + * @lock: the rwlock in question. + */ +static inline int arch_read_can_lock(arch_rwlock_t *lock) +{ + return lock->lock > 0; +} + +/** + * write_can_lock - would write_trylock() succeed? + * @lock: the rwlock in question. 
+ */ +static inline int arch_write_can_lock(arch_rwlock_t *lock) +{ + return lock->write == WRITE_LOCK_CMP; +} + +static inline void arch_read_lock(arch_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t" + "jns 1f\n" + "call __read_lock_failed\n\t" + "1:\n" + ::LOCK_PTR_REG (rw) : "memory"); +} + +static inline void arch_write_lock(arch_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t" + "jz 1f\n" + "call __write_lock_failed\n\t" + "1:\n" + ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS) + : "memory"); +} + +static inline int arch_read_trylock(arch_rwlock_t *lock) +{ + READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock; + + if (READ_LOCK_ATOMIC(dec_return)(count) >= 0) + return 1; + READ_LOCK_ATOMIC(inc)(count); + return 0; +} + +static inline int arch_write_trylock(arch_rwlock_t *lock) +{ + atomic_t *count = (atomic_t *)&lock->write; + + if (atomic_sub_and_test(WRITE_LOCK_CMP, count)) + return 1; + atomic_add(WRITE_LOCK_CMP, count); + return 0; +} + +static inline void arch_read_unlock(arch_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0" + :"+m" (rw->lock) : : "memory"); +} + +static inline void arch_write_unlock(arch_rwlock_t *rw) +{ + asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0" + : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory"); +} + +#define arch_read_lock_flags(lock, flags) arch_read_lock(lock) +#define arch_write_lock_flags(lock, flags) arch_write_lock(lock) + +#undef READ_LOCK_SIZE +#undef READ_LOCK_ATOMIC +#undef WRITE_LOCK_ADD +#undef WRITE_LOCK_SUB +#undef WRITE_LOCK_CMP + +#define arch_spin_relax(lock) cpu_relax() +#define arch_read_relax(lock) cpu_relax() +#define arch_write_relax(lock) cpu_relax() + +/* The {read|write|spin}_lock() on x86 are full memory barriers. */ +static inline void smp_mb__after_lock(void) { } +#define ARCH_HAS_SMP_MB_AFTER_LOCK + +#endif /* _ASM_X86_SPINLOCK_H */ diff --git a/arch/x86/include/mach-xen/asm/spinlock_types.h b/arch/x86/include/mach-xen/asm/spinlock_types.h new file mode 100644 index 0000000..d78bbc0 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/spinlock_types.h @@ -0,0 +1,62 @@ +#ifndef _ASM_X86_SPINLOCK_TYPES_H +#define _ASM_X86_SPINLOCK_TYPES_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +# error "please don't include this file directly" +#endif + +#include + +#ifdef CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING +/* + * On Xen we support CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING levels of + * interrupt re-enabling per IRQ-safe lock. Hence we can have + * (CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING + 1) times as many outstanding + * tickets. Thus the cut-off for using byte register pairs must be at + * a sufficiently smaller number of CPUs. + */ +#if (CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING + 1) * CONFIG_NR_CPUS < 256 +typedef u8 __ticket_t; +# define TICKET_SHIFT 8 +typedef u16 __ticketpair_t; +#else +typedef u16 __ticket_t; +# define TICKET_SHIFT 16 +typedef u32 __ticketpair_t; +#endif + +typedef union { + __ticketpair_t head_tail; + struct { + struct __raw_tickets { + __ticket_t head, tail; + } tickets; +#if CONFIG_NR_CPUS <= 256 + u8 owner; +#else + u16 owner; +#endif + }; +#else /* ndef CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING */ +typedef struct { +/* + * This differs from the pre-2.6.24 spinlock by always using xchgb + * rather than decb to take the lock; this allows it to use a + * zero-initialized lock structure. It also maintains a 1-byte + * contention counter, so that we can implement + * __byte_spin_is_contended. 
+ */ + u8 lock; +#if CONFIG_NR_CPUS < 256 + u8 spinners; +#else +# error NR_CPUS >= 256 not implemented +#endif +#endif /* def CONFIG_XEN_SPINLOCK_ACQUIRE_NESTING */ +} arch_spinlock_t; + +#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } + +#include + +#endif /* _ASM_X86_SPINLOCK_TYPES_H */ diff --git a/arch/x86/include/mach-xen/asm/swiotlb.h b/arch/x86/include/mach-xen/asm/swiotlb.h new file mode 100644 index 0000000..e82aad1 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/swiotlb.h @@ -0,0 +1,8 @@ +#include_next + +#ifndef CONFIG_SWIOTLB +#define swiotlb_init(verbose) ((void)(verbose)) +#endif + +dma_addr_t swiotlb_map_single_phys(struct device *, phys_addr_t, size_t size, + int dir); diff --git a/arch/x86/include/mach-xen/asm/system.h b/arch/x86/include/mach-xen/asm/system.h new file mode 100644 index 0000000..09ef84b --- /dev/null +++ b/arch/x86/include/mach-xen/asm/system.h @@ -0,0 +1,520 @@ +#ifndef _ASM_X86_SYSTEM_H +#define _ASM_X86_SYSTEM_H + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* entries in ARCH_DLINFO: */ +#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) +# define AT_VECTOR_SIZE_ARCH 2 +#else /* else it's non-compat x86-64 */ +# define AT_VECTOR_SIZE_ARCH 1 +#endif + +struct task_struct; /* one of the stranger aspects of C forward declarations */ +struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p); +extern void show_regs_common(void); + +#ifdef CONFIG_X86_32 + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movl %P[task_canary](%[next]), %%ebx\n\t" \ + "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" +#define __switch_canary_oparam \ + , [stack_canary] "=m" (stack_canary.canary) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* + * Saving eflags is important. It switches not only IOPL between tasks, + * it also protects other tasks from NT leaking through sysenter etc. + */ +#define switch_to(prev, next, last) \ +do { \ + /* \ + * Context-switching clobbers all registers, so we clobber \ + * them explicitly, via unused output variables. 
\ + * (EAX and EBP is not listed because EBP is saved/restored \ + * explicitly for wchan access and EAX is the return value of \ + * __switch_to()) \ + */ \ + unsigned long ebx, ecx, edx, esi, edi; \ + \ + asm volatile("pushfl\n\t" /* save flags */ \ + "pushl %%ebp\n\t" /* save EBP */ \ + "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ + "pushl %[next_ip]\n\t" /* restore EIP */ \ + __switch_canary \ + "jmp __switch_to\n" /* regparm call */ \ + "1:\t" \ + "popl %%ebp\n\t" /* restore EBP */ \ + "popfl\n" /* restore flags */ \ + \ + /* output parameters */ \ + : [prev_sp] "=m" (prev->thread.sp), \ + [prev_ip] "=m" (prev->thread.ip), \ + "=a" (last), \ + \ + /* clobbered output registers: */ \ + "=b" (ebx), "=c" (ecx), "=d" (edx), \ + "=S" (esi), "=D" (edi) \ + \ + __switch_canary_oparam \ + \ + /* input parameters: */ \ + : [next_sp] "m" (next->thread.sp), \ + [next_ip] "m" (next->thread.ip), \ + \ + /* regparm parameters for __switch_to(): */ \ + [prev] "a" (prev), \ + [next] "d" (next) \ + \ + __switch_canary_iparam \ + \ + : /* reloaded segment registers */ \ + "memory"); \ +} while (0) + +#ifndef CONFIG_XEN +/* + * disable hlt during certain critical i/o operations + */ +#define HAVE_DISABLE_HLT +#endif +#else + +/* frame pointer must be last for get_wchan */ +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" + +#define __EXTRA_CLOBBER \ + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ + "r12", "r13", "r14", "r15" + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movq %P[task_canary](%%rsi),%%r8\n\t" \ + "movq %%r8,"__percpu_arg([gs_canary])"\n\t" +#define __switch_canary_oparam \ + , [gs_canary] "=m" (irq_stack_union.stack_canary) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* The stack unwind code needs this but it pollutes traces otherwise */ +#ifdef CONFIG_UNWIND_INFO +#define THREAD_RETURN_SYM \ + ".globl thread_return\n" \ + "thread_return:\n\t" +#else +#define THREAD_RETURN_SYM +#endif + +/* Save restore flags to clear handle leaking NT */ +#define switch_to(prev, next, last) \ + asm volatile(SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ + THREAD_RETURN_SYM \ + "movq "__percpu_arg([current_task])",%%rsi\n\t" \ + __switch_canary \ + "movq %P[thread_info](%%rsi),%%r8\n\t" \ + "movq %%rax,%%rdi\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "jnz ret_from_fork\n\t" \ + RESTORE_CONTEXT \ + : "=a" (last) \ + __switch_canary_oparam \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)), \ + [_tif_fork] "i" (_TIF_FORK), \ + [thread_info] "i" (offsetof(struct task_struct, stack)), \ + [current_task] "m" (current_task) \ + __switch_canary_iparam \ + : "memory", "cc" __EXTRA_CLOBBER) +#endif + +#ifdef __KERNEL__ + +extern void xen_load_gs_index(unsigned); + +/* + * Load a segment. Fall back on loading the zero + * segment if something goes wrong.. 
+ */ +#define loadsegment(seg, value) \ +do { \ + unsigned short __val = (value); \ + \ + asm volatile(" \n" \ + "1: movl %k0,%%" #seg " \n" \ + \ + ".section .fixup,\"ax\" \n" \ + "2: xorl %k0,%k0 \n" \ + " jmp 1b \n" \ + ".previous \n" \ + \ + _ASM_EXTABLE(1b, 2b) \ + \ + : "+r" (__val) : : "memory"); \ +} while (0) + +/* + * Save a segment register away + */ +#define savesegment(seg, value) \ + asm("mov %%" #seg ",%0":"=r" (value) : : "memory") + +/* + * x86_32 user gs accessors. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_32_LAZY_GS +#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) +#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +#define task_user_gs(tsk) ((tsk)->thread.gs) +#define lazy_save_gs(v) savesegment(gs, (v)) +#define lazy_load_gs(v) loadsegment(gs, (v)) +#else /* X86_32_LAZY_GS */ +#define get_user_gs(regs) (u16)((regs)->gs) +#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +#define lazy_save_gs(v) do { } while (0) +#define lazy_load_gs(v) do { } while (0) +#endif /* X86_32_LAZY_GS */ +#endif /* X86_32 */ + +static inline unsigned long get_limit(unsigned long segment) +{ + unsigned long __limit; + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); + return __limit + 1; +} + +static inline void xen_clts(void) +{ + HYPERVISOR_fpu_taskswitch(0); +} + +static inline void xen_stts(void) +{ + HYPERVISOR_fpu_taskswitch(1); +} + +/* + * Volatile isn't enough to prevent the compiler from reordering the + * read/write functions for the control registers and messing everything up. + * A memory clobber would solve the problem, but would prevent reordering of + * all loads stores around it, which can hurt performance. Solution is to + * use a variable and mimic reads and writes to it to enforce serialization + */ +static unsigned long __force_order; + +static inline unsigned long xen_read_cr0(void) +{ + unsigned long val; + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void xen_write_cr0(unsigned long val) +{ + asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); +} + +#define xen_read_cr2() vcpu_info_read(arch.cr2) +#define xen_write_cr2(val) vcpu_info_write(arch.cr2, val) + +static inline unsigned long xen_read_cr3(void) +{ + unsigned long val; + asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); +#ifdef CONFIG_X86_32 + return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT; +#else + return machine_to_phys(val); +#endif +} + +static inline void xen_write_cr3(unsigned long val) +{ +#ifdef CONFIG_X86_32 + val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT)); +#else + val = phys_to_machine(val); +#endif + asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long xen_read_cr4(void) +{ + unsigned long val; + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +#define xen_read_cr4_safe() xen_read_cr4() + +static inline void xen_write_cr4(unsigned long val) +{ + asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); +} + +#ifdef CONFIG_X86_64 +static inline unsigned long xen_read_cr8(void) +{ + return 0; +} + +static inline void xen_write_cr8(unsigned long val) +{ + BUG_ON(val); +} +#endif + +static inline void xen_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + +static inline unsigned long read_cr0(void) +{ + return xen_read_cr0(); +} + +static inline void write_cr0(unsigned long x) +{ + 
xen_write_cr0(x); +} + +static inline unsigned long read_cr2(void) +{ + return xen_read_cr2(); +} + +static inline void write_cr2(unsigned long x) +{ + xen_write_cr2(x); +} + +static inline unsigned long read_cr3(void) +{ + return xen_read_cr3(); +} + +static inline void write_cr3(unsigned long x) +{ + xen_write_cr3(x); +} + +static inline unsigned long read_cr4(void) +{ + return xen_read_cr4(); +} + +static inline unsigned long read_cr4_safe(void) +{ + return xen_read_cr4_safe(); +} + +static inline void write_cr4(unsigned long x) +{ + xen_write_cr4(x); +} + +static inline void wbinvd(void) +{ + xen_wbinvd(); +} + +#ifdef CONFIG_X86_64 + +static inline unsigned long read_cr8(void) +{ + return xen_read_cr8(); +} + +static inline void write_cr8(unsigned long x) +{ + xen_write_cr8(x); +} + +static inline void load_gs_index(unsigned selector) +{ + xen_load_gs_index(selector); +} + +#endif + +/* Clear the 'TS' bit */ +static inline void clts(void) +{ + xen_clts(); +} + +static inline void stts(void) +{ + xen_stts(); +} + +#endif /* __KERNEL__ */ + +static inline void clflush(volatile void *__p) +{ + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); +} + +#define nop() asm volatile ("nop") + +void disable_hlt(void); +void enable_hlt(void); + +void cpu_idle_wait(void); + +extern unsigned long arch_align_stack(unsigned long sp); +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); + +void xen_idle(void); +bool set_pm_idle_to_default(void); + +void stop_this_cpu(void *dummy); + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ +#ifdef CONFIG_X86_32 +/* + * Some non-Intel clones support out of order store. wmb() ceases to be a + * nop for these. + */ +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#else +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#endif + +/** + * read_barrier_depends - Flush all pending reads that subsequents reads + * depend on. + * + * No data-dependent reads from memory-like regions are ever reordered + * over this barrier. All reads preceding this primitive are guaranteed + * to access memory (but not necessarily other CPUs' caches) before any + * reads following this primitive that depend on the data return by + * any of the preceding reads. This primitive is much lighter weight than + * rmb() on most CPUs, and is never heavier weight than is + * rmb(). + * + * These ordering constraints are respected by both the local CPU + * and the compiler. + * + * Ordering is not guaranteed by anything other than these primitives, + * not even by data dependencies. See the documentation for + * memory_barrier() for examples and URLs to more information. + * + * For example, the following code would force ordering (the initial + * value of "a" is zero, "b" is one, and "p" is "&a"): + * + * + * CPU 0 CPU 1 + * + * b = 2; + * memory_barrier(); + * p = &b; q = p; + * read_barrier_depends(); + * d = *q; + * + * + * because the read of "*q" depends on the read of "p" and these + * two reads are separated by a read_barrier_depends(). 
However, + * the following code, with the same initial values for "a" and "b": + * + * + * CPU 0 CPU 1 + * + * a = 2; + * memory_barrier(); + * b = 3; y = b; + * read_barrier_depends(); + * x = a; + * + * + * does not enforce ordering, since there is no data dependency between + * the read of "a" and the read of "b". Therefore, on some CPUs, such + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() + * in cases like this where there are no data dependencies. + **/ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +# define smp_rmb() rmb() +#else +# define smp_rmb() barrier() +#endif +#ifdef CONFIG_X86_OOSTORE +# define smp_wmb() wmb() +#else +# define smp_wmb() barrier() +#endif +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) + */ +static __always_inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +/* + * We handle most unaligned accesses in hardware. On the other hand + * unaligned DMA can be quite expensive on some Nehalem processors. + * + * Based on this we disable the IP header alignment in network drivers. + */ +#define NET_IP_ALIGN 0 +#endif /* _ASM_X86_SYSTEM_H */ diff --git a/arch/x86/include/mach-xen/asm/time.h b/arch/x86/include/mach-xen/asm/time.h new file mode 100644 index 0000000..d898756 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/time.h @@ -0,0 +1,18 @@ +#ifndef _XEN_ASM_TIME_H +#define _XEN_ASM_TIME_H + +unsigned long xen_read_wallclock(void); +int xen_write_wallclock(unsigned long); + +struct timespec; +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +int xen_update_wallclock(const struct timespec *); +#else +static inline int xen_update_wallclock(const struct timespec *tv) { + return -EPERM; +} +#endif + +#endif /* _XEN_ASM_TIME_H */ + +#include_next diff --git a/arch/x86/include/mach-xen/asm/tlbflush.h b/arch/x86/include/mach-xen/asm/tlbflush.h new file mode 100644 index 0000000..0dc6dd6 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/tlbflush.h @@ -0,0 +1,114 @@ +#ifndef _ASM_X86_TLBFLUSH_H +#define _ASM_X86_TLBFLUSH_H + +#include +#include + +#include +#include + +#define __flush_tlb() xen_tlb_flush() +#define __flush_tlb_global() xen_tlb_flush() +#define __flush_tlb_single(addr) xen_invlpg(addr) +#define __flush_tlb_all() xen_tlb_flush() +#define __flush_tlb_one(addr) xen_invlpg(addr) + +#ifdef CONFIG_X86_32 +# define TLB_FLUSH_ALL 0xffffffff +#else +# define TLB_FLUSH_ALL -1ULL +#endif + +/* + * TLB flushing: + * + * - flush_tlb() flushes the current mm struct TLBs + * - flush_tlb_all() flushes all processes TLBs + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages + * + * ..but the i386 has somewhat limited tlb flushing capabilities, + * and page-granular 
flushes are available only on i486 and up. + * + * x86-64 can only flush individual pages or full VMs. For a range flush + * we always do the full VM. Might be worth trying if for a small + * range a few INVLPGs in a row are a win. + */ + +#ifndef CONFIG_SMP + +#define flush_tlb() __flush_tlb() +#define flush_tlb_all() __flush_tlb_all() +#define local_flush_tlb() __flush_tlb() + +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + if (mm == current->active_mm) + __flush_tlb(); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long addr) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb_one(addr); +} + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (vma->vm_mm == current->active_mm) + __flush_tlb(); +} + +static inline void reset_lazy_tlbstate(void) +{ +} + +#else /* SMP */ + +#include + +#define local_flush_tlb() __flush_tlb() + +#define flush_tlb_all xen_tlb_flush_all +#define flush_tlb_current_task() xen_tlb_flush_mask(mm_cpumask(current->mm)) +#define flush_tlb_mm(mm) xen_tlb_flush_mask(mm_cpumask(mm)) +#define flush_tlb_page(vma, va) xen_invlpg_mask(mm_cpumask((vma)->vm_mm), va) + +#define flush_tlb() flush_tlb_current_task() + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + flush_tlb_mm(vma->vm_mm); +} + +#ifndef CONFIG_XEN +#define TLBSTATE_OK 1 +#define TLBSTATE_LAZY 2 + +struct tlb_state { + struct mm_struct *active_mm; + int state; +}; +DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); + +static inline void reset_lazy_tlbstate(void) +{ + percpu_write(cpu_tlbstate.state, 0); + percpu_write(cpu_tlbstate.active_mm, &init_mm); +} +#endif + +#endif /* SMP */ + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + flush_tlb_all(); +} + +#endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/include/mach-xen/asm/vga.h b/arch/x86/include/mach-xen/asm/vga.h new file mode 100644 index 0000000..fe4a3c45 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/vga.h @@ -0,0 +1,20 @@ +/* + * Access to VGA videoram + * + * (c) 1998 Martin Mares + */ + +#ifndef _ASM_X86_VGA_H +#define _ASM_X86_VGA_H + +/* + * On the PC, we can just recalculate addresses and then + * access the videoram directly without any black magic. + */ + +#define VGA_MAP_MEM(x, s) (unsigned long)isa_bus_to_virt(x) + +#define vga_readb(x) (*(x)) +#define vga_writeb(x, y) (*(y) = (x)) + +#endif /* _ASM_X86_VGA_H */ diff --git a/arch/x86/include/mach-xen/asm/xenoprof.h b/arch/x86/include/mach-xen/asm/xenoprof.h new file mode 100644 index 0000000..2733e00 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/xenoprof.h @@ -0,0 +1,48 @@ +/****************************************************************************** + * asm-i386/mach-xen/asm/xenoprof.h + * + * Copyright (c) 2006 Isaku Yamahata + * VA Linux Systems Japan K.K. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef __ASM_XENOPROF_H__ +#define __ASM_XENOPROF_H__ +#ifdef CONFIG_XEN + +struct super_block; +struct dentry; +int xenoprof_create_files(struct super_block * sb, struct dentry * root); +#define HAVE_XENOPROF_CREATE_FILES + +struct xenoprof_init; +void xenoprof_arch_init_counter(struct xenoprof_init *init); +void xenoprof_arch_counter(void); +void xenoprof_arch_start(void); +void xenoprof_arch_stop(void); + +struct xenoprof_arch_shared_buffer { + /* nothing */ +}; +struct xenoprof_shared_buffer; +void xenoprof_arch_unmap_shared_buffer(struct xenoprof_shared_buffer* sbuf); +struct xenoprof_get_buffer; +int xenoprof_arch_map_shared_buffer(struct xenoprof_get_buffer* get_buffer, struct xenoprof_shared_buffer* sbuf); +struct xenoprof_passive; +int xenoprof_arch_set_passive(struct xenoprof_passive* pdomain, struct xenoprof_shared_buffer* sbuf); + +#endif /* CONFIG_XEN */ +#endif /* __ASM_XENOPROF_H__ */ diff --git a/arch/x86/include/mach-xen/asm/xor.h b/arch/x86/include/mach-xen/asm/xor.h new file mode 100644 index 0000000..edb08e6 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/xor.h @@ -0,0 +1,8 @@ +#ifdef CONFIG_KMEMCHECK +/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ +# include +#elif defined(CONFIG_X86_32) +# include "../../asm/xor_32.h" +#else +# include "xor_64.h" +#endif diff --git a/arch/x86/include/mach-xen/asm/xor_64.h b/arch/x86/include/mach-xen/asm/xor_64.h new file mode 100644 index 0000000..7d05c24 --- /dev/null +++ b/arch/x86/include/mach-xen/asm/xor_64.h @@ -0,0 +1,339 @@ +#ifndef _ASM_X86_XOR_64_H +#define _ASM_X86_XOR_64_H + +#include + +/* + * x86-64 changes / gcc fixes from Andi Kleen. + * Copyright 2002 Andi Kleen, SuSE Labs. + * + * This hasn't been optimized for the hammer yet, but there are likely + * no advantages to be gotten from x86-64 here anyways. + */ + +typedef struct { + unsigned long a, b; +} __attribute__((aligned(16))) xmm_store_t; + +/* Doesn't use gcc to save the XMM registers, because there is no easy way to + tell it to do a clts before the register saving. 
*/ +#define XMMS_SAVE \ +do { \ + preempt_disable(); \ + if (!__thread_has_fpu(current)) \ + clts(); \ + asm volatile( \ + "movups %%xmm0,(%1) ;\n\t" \ + "movups %%xmm1,0x10(%1) ;\n\t" \ + "movups %%xmm2,0x20(%1) ;\n\t" \ + "movups %%xmm3,0x30(%1) ;\n\t" \ + : "=&r" (cr0) \ + : "r" (xmm_save) \ + : "memory"); \ +} while (0) + +#define XMMS_RESTORE \ +do { \ + asm volatile( \ + "sfence ;\n\t" \ + "movups (%1),%%xmm0 ;\n\t" \ + "movups 0x10(%1),%%xmm1 ;\n\t" \ + "movups 0x20(%1),%%xmm2 ;\n\t" \ + "movups 0x30(%1),%%xmm3 ;\n\t" \ + : \ + : "r" (cr0), "r" (xmm_save) \ + : "memory"); \ + if (!__thread_has_fpu(current)) \ + stts(); \ + preempt_enable(); \ +} while (0) + +#define OFFS(x) "16*("#x")" +#define PF_OFFS(x) "256+16*("#x")" +#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" +#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" +#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" +#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" +#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" +#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" +#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" +#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n" +#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" +#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" +#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" +#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" +#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" + + +static void +xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) +{ + unsigned int lines = bytes >> 8; + unsigned long cr0; + xmm_store_t xmm_save[4]; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + LD(i, 0) \ + LD(i + 1, 1) \ + PF1(i) \ + PF1(i + 2) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " decl %[cnt] ; jnz 1b" + : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) + : [inc] "r" (256UL) + : "memory"); + + XMMS_RESTORE; +} + +static void +xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+r" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) + : [inc] "r" (256UL) + : "memory"); + XMMS_RESTORE; +} + +static void +xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 
1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+c" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) + : [inc] "r" (256UL) + : "memory" ); + + XMMS_RESTORE; +} + +static void +xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, + unsigned long *p3, unsigned long *p4, unsigned long *p5) +{ + unsigned int lines = bytes >> 8; + xmm_store_t xmm_save[4]; + unsigned long cr0; + + XMMS_SAVE; + + asm volatile( +#undef BLOCK +#define BLOCK(i) \ + PF1(i) \ + PF1(i + 2) \ + LD(i, 0) \ + LD(i + 1, 1) \ + LD(i + 2, 2) \ + LD(i + 3, 3) \ + PF2(i) \ + PF2(i + 2) \ + XO1(i, 0) \ + XO1(i + 1, 1) \ + XO1(i + 2, 2) \ + XO1(i + 3, 3) \ + PF3(i) \ + PF3(i + 2) \ + XO2(i, 0) \ + XO2(i + 1, 1) \ + XO2(i + 2, 2) \ + XO2(i + 3, 3) \ + PF4(i) \ + PF4(i + 2) \ + PF0(i + 4) \ + PF0(i + 6) \ + XO3(i, 0) \ + XO3(i + 1, 1) \ + XO3(i + 2, 2) \ + XO3(i + 3, 3) \ + XO4(i, 0) \ + XO4(i + 1, 1) \ + XO4(i + 2, 2) \ + XO4(i + 3, 3) \ + ST(i, 0) \ + ST(i + 1, 1) \ + ST(i + 2, 2) \ + ST(i + 3, 3) \ + + + PF0(0) + PF0(2) + + " .align 32 ;\n" + " 1: ;\n" + + BLOCK(0) + BLOCK(4) + BLOCK(8) + BLOCK(12) + + " addq %[inc], %[p1] ;\n" + " addq %[inc], %[p2] ;\n" + " addq %[inc], %[p3] ;\n" + " addq %[inc], %[p4] ;\n" + " addq %[inc], %[p5] ;\n" + " decl %[cnt] ; jnz 1b" + : [cnt] "+c" (lines), + [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), + [p5] "+r" (p5) + : [inc] "r" (256UL) + : "memory"); + + XMMS_RESTORE; +} + +static struct xor_block_template xor_block_sse = { + .name = "generic_sse", + .do_2 = xor_sse_2, + .do_3 = xor_sse_3, + .do_4 = xor_sse_4, + .do_5 = xor_sse_5, +}; + +#undef XOR_TRY_TEMPLATES +#define XOR_TRY_TEMPLATES \ +do { \ + xor_speed(&xor_block_sse); \ +} while (0) + +/* We force the use of the SSE xor block because it can write around L2. + We may also be able to load into the L1 only depending on how the cpu + deals with a load to a line that is being prefetched. 
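+   As a consequence, XOR_SELECT_TEMPLATE() below ignores whatever template
+   xor_speed() found to be fastest and unconditionally returns
+   &xor_block_sse; the XOR_TRY_TEMPLATES benchmark is informational only.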
*/ +#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) + +#endif /* _ASM_X86_XOR_64_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5369059..dadd24e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -101,6 +101,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o +obj-$(CONFIG_X86_XEN) += fixup.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) @@ -112,3 +114,8 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o endif + +disabled-obj-$(CONFIG_XEN) := crash.o early-quirks.o hpet.o i8237.o i8253.o \ + i8259.o irqinit.o pci-swiotlb.o reboot.o smpboot.o trampoline%.o \ + tsc.o tsc_sync.o vsmp_64.o +disabled-obj-$(CONFIG_XEN_UNPRIVILEGED_GUEST) += probe_roms.o diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 6f35260..528e3de 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -5,6 +5,9 @@ obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o +ifneq ($(CONFIG_PROCESSOR_EXTERNAL_CONTROL),) +obj-$(CONFIG_XEN) += processor_extcntl_xen.o +endif endif $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin @@ -12,3 +15,4 @@ $(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin $(obj)/realmode/wakeup.bin: FORCE $(Q)$(MAKE) $(build)=$(obj)/realmode +disabled-obj-$(CONFIG_XEN) := cstate.o sleep.o wakeup_%.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 1b9b052..84cfb0d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -70,6 +70,7 @@ int acpi_strict; u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; +#ifndef CONFIG_XEN int acpi_skip_timer_override __initdata; int acpi_use_timer_override __initdata; int acpi_fix_pin2_polarity __initdata; @@ -77,6 +78,10 @@ int acpi_fix_pin2_polarity __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #endif +#else +#define acpi_skip_timer_override 0 +#define acpi_fix_pin2_polarity 0 +#endif #ifndef __HAVE_ARCH_CMPXCHG #warning ACPI uses CMPXCHG, i486 and later hardware @@ -182,6 +187,7 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) return -ENODEV; } +#ifndef CONFIG_XEN if (madt->address) { acpi_lapic_addr = (u64) madt->address; @@ -191,12 +197,14 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) default_acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); +#endif return 0; } static void __cpuinit acpi_register_lapic(int id, u8 enabled) { +#ifndef CONFIG_XEN unsigned int ver = 0; if (id >= (MAX_LOCAL_APIC-1)) { @@ -213,6 +221,7 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) ver = apic_version[boot_cpu_physical_apicid]; generic_processor_info(id, ver); +#endif } static int __init @@ -297,6 +306,7 @@ static int __init acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, const unsigned long end) { +#ifndef CONFIG_XEN struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL; lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header; @@ -305,6 +315,7 @@ acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, return -EINVAL; acpi_lapic_addr = lapic_addr_ovr->address; +#endif return 0; } @@ -593,6 +604,7 @@ void __init acpi_set_irq_model_ioapic(void) #ifdef CONFIG_ACPI_HOTPLUG_CPU #include +#ifndef CONFIG_XEN static void acpi_map_cpu2node(acpi_handle handle, int 
cpu, int physid) { #ifdef CONFIG_ACPI_NUMA @@ -678,6 +690,9 @@ free_tmp_map: out: return retval; } +#else +#define _acpi_map_lsapic(h, p) (-EINVAL) +#endif /* wrapper to silence section mismatch warning */ int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu) @@ -688,9 +703,11 @@ EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { +#ifndef CONFIG_XEN per_cpu(x86_cpu_to_apicid, cpu) = -1; set_cpu_present(cpu, false); num_processors--; +#endif return (0); } @@ -1332,6 +1349,7 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) return 0; } +#ifndef CONFIG_XEN /* * Force ignoring BIOS IRQ0 pin2 override */ @@ -1349,6 +1367,7 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) } return 0; } +#endif static int __init force_acpi_rsdt(const struct dmi_system_id *d) { @@ -1469,6 +1488,7 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { {} }; +#ifndef CONFIG_XEN /* second table for DMI checks that should run after early-quirks */ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { /* @@ -1515,6 +1535,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { }, {} }; +#endif /* * acpi_boot_table_init() and acpi_boot_init() @@ -1587,8 +1608,10 @@ int __init early_acpi_boot_init(void) int __init acpi_boot_init(void) { +#ifndef CONFIG_XEN /* those are executed after early-quirks are executed */ dmi_check_system(acpi_dmi_table_late); +#endif /* * If acpi_disabled, bail out @@ -1688,7 +1711,7 @@ int __init acpi_mps_check(void) return 0; } -#ifdef CONFIG_X86_IO_APIC +#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_XEN) static int __init parse_acpi_skip_timer_override(char *arg) { acpi_skip_timer_override = 1; diff --git a/arch/x86/kernel/acpi/processor_extcntl_xen.c b/arch/x86/kernel/acpi/processor_extcntl_xen.c new file mode 100644 index 0000000..6293fac --- /dev/null +++ b/arch/x86/kernel/acpi/processor_extcntl_xen.c @@ -0,0 +1,287 @@ +/* + * processor_extcntl_xen.c - interface to notify Xen + * + * Copyright (C) 2008, Intel corporation + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
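+ *
+ * In short: the notifiers in this file forward the ACPI C-state
+ * (XEN_PM_CX), P-state (XEN_PM_PX) and CPU hotplug information gathered
+ * by dom0 to the hypervisor through the XENPF_set_processor_pminfo and
+ * XENPF_cpu_hotadd platform hypercalls, so that the actual idle and
+ * frequency management is done by Xen rather than by this kernel.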
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int xen_cx_notifier(struct acpi_processor *pr, int action) +{ + int ret, count = 0, i; + xen_platform_op_t op = { + .cmd = XENPF_set_processor_pminfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.set_pminfo.id = pr->acpi_id, + .u.set_pminfo.type = XEN_PM_CX, + }; + struct xen_processor_cx *data, *buf; + struct acpi_processor_cx *cx; + + /* Convert to Xen defined structure and hypercall */ + buf = kzalloc(pr->power.count * sizeof(struct xen_processor_cx), + GFP_KERNEL); + if (!buf) + return -ENOMEM; + + data = buf; + for (i = 1; i <= pr->power.count; i++) { + cx = &pr->power.states[i]; + /* Skip invalid cstate entry */ + if (!cx->valid) + continue; + + data->type = cx->type; + data->latency = cx->latency; + data->power = cx->power; + data->reg.space_id = cx->reg.space_id; + data->reg.bit_width = cx->reg.bit_width; + data->reg.bit_offset = cx->reg.bit_offset; + data->reg.access_size = cx->reg.access_size; + data->reg.address = cx->reg.address; + + /* Get dependency relationships */ + if (cx->csd_count) { + pr_warning("_CSD found: Not supported for now!\n"); + kfree(buf); + return -EINVAL; + } else { + data->dpcnt = 0; + set_xen_guest_handle(data->dp, NULL); + } + + data++; + count++; + } + + if (!count) { + pr_info("No available Cx info for cpu %d\n", pr->acpi_id); + kfree(buf); + return -EINVAL; + } + + op.u.set_pminfo.u.power.count = count; + op.u.set_pminfo.u.power.flags.bm_control = pr->flags.bm_control; + op.u.set_pminfo.u.power.flags.bm_check = pr->flags.bm_check; + op.u.set_pminfo.u.power.flags.has_cst = pr->flags.has_cst; + op.u.set_pminfo.u.power.flags.power_setup_done = pr->flags.power_setup_done; + + set_xen_guest_handle(op.u.set_pminfo.u.power.states, buf); + ret = HYPERVISOR_platform_op(&op); + kfree(buf); + return ret; +} + +static int xen_px_notifier(struct acpi_processor *pr, int action) +{ + int ret = -EINVAL; + xen_platform_op_t op = { + .cmd = XENPF_set_processor_pminfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.set_pminfo.id = pr->acpi_id, + .u.set_pminfo.type = XEN_PM_PX, + }; + struct xen_processor_performance *perf; + struct xen_processor_px *states = NULL; + struct acpi_processor_performance *px; + struct acpi_psd_package *pdomain; + + if (!pr) + return -EINVAL; + + perf = &op.u.set_pminfo.u.perf; + px = pr->performance; + if (!px) + return -EINVAL; + + switch(action) { + case PROCESSOR_PM_CHANGE: + /* ppc dynamic handle */ + perf->flags = XEN_PX_PPC; + perf->platform_limit = pr->performance_platform_limit; + + ret = HYPERVISOR_platform_op(&op); + break; + + case PROCESSOR_PM_INIT: + /* px normal init */ + perf->flags = XEN_PX_PPC | + XEN_PX_PCT | + XEN_PX_PSS | + XEN_PX_PSD; + + /* ppc */ + perf->platform_limit = pr->performance_platform_limit; + + /* pct */ + xen_convert_pct_reg(&perf->control_register, &px->control_register); + xen_convert_pct_reg(&perf->status_register, &px->status_register); + + /* pss */ + perf->state_count = px->state_count; + states = kzalloc(px->state_count*sizeof(xen_processor_px_t),GFP_KERNEL); + if (!states) + return -ENOMEM; + xen_convert_pss_states(states, px->states, px->state_count); + set_xen_guest_handle(perf->states, states); + + /* psd */ + pdomain = &px->domain_info; + xen_convert_psd_pack(&perf->domain_info, pdomain); + if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) + perf->shared_type = CPUFREQ_SHARED_TYPE_ALL; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) + 
perf->shared_type = CPUFREQ_SHARED_TYPE_ANY; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) + perf->shared_type = CPUFREQ_SHARED_TYPE_HW; + else { + ret = -ENODEV; + kfree(states); + break; + } + + ret = HYPERVISOR_platform_op(&op); + kfree(states); + break; + + default: + break; + } + + return ret; +} + +static int xen_tx_notifier(struct acpi_processor *pr, int action) +{ + return -EINVAL; +} + +static int xen_hotplug_notifier(struct acpi_processor *pr, int event) +{ + int ret = -EINVAL; +#ifdef CONFIG_ACPI_HOTPLUG_CPU + acpi_status status = 0; + acpi_object_type type; + uint32_t apic_id; + int device_decl = 0; + unsigned long long pxm; + xen_platform_op_t op; + + status = acpi_get_type(pr->handle, &type); + if (ACPI_FAILURE(status)) { + pr_warn("can't get object type for acpi_id %#x\n", + pr->acpi_id); + return -ENXIO; + } + + switch (type) { + case ACPI_TYPE_PROCESSOR: + break; + case ACPI_TYPE_DEVICE: + device_decl = 1; + break; + default: + pr_warn("unsupported object type %#x for acpi_id %#x\n", + type, pr->acpi_id); + return -EOPNOTSUPP; + } + + apic_id = acpi_get_cpuid(pr->handle, ~device_decl, pr->acpi_id); + if (apic_id < 0) { + pr_warn("can't get apic_id for acpi_id %#x\n", pr->acpi_id); + return -ENODATA; + } + + status = acpi_evaluate_integer(pr->handle, "_PXM", NULL, &pxm); + if (ACPI_FAILURE(status)) { + pr_warn("can't get pxm for acpi_id %#x\n", pr->acpi_id); + return -ENODATA; + } + + switch (event) { + case HOTPLUG_TYPE_ADD: + op.cmd = XENPF_cpu_hotadd; + op.u.cpu_add.apic_id = apic_id; + op.u.cpu_add.acpi_id = pr->acpi_id; + op.u.cpu_add.pxm = pxm; + ret = HYPERVISOR_platform_op(&op); + break; + case HOTPLUG_TYPE_REMOVE: + pr_warn("Xen doesn't support CPU hot remove\n"); + ret = -EOPNOTSUPP; + break; + } +#endif + + return ret; +} + +static struct processor_extcntl_ops xen_extcntl_ops = { + .hotplug = xen_hotplug_notifier, +}; + +static int __init init_extcntl(void) +{ + unsigned int pmbits = (xen_start_info->flags & SIF_PM_MASK) >> 8; + +#ifndef CONFIG_ACPI_HOTPLUG_CPU + if (!pmbits) + return 0; +#endif + if (pmbits & XEN_PROCESSOR_PM_CX) + xen_extcntl_ops.pm_ops[PM_TYPE_IDLE] = xen_cx_notifier; + if (pmbits & XEN_PROCESSOR_PM_PX) + xen_extcntl_ops.pm_ops[PM_TYPE_PERF] = xen_px_notifier; + if (pmbits & XEN_PROCESSOR_PM_TX) + xen_extcntl_ops.pm_ops[PM_TYPE_THR] = xen_tx_notifier; + + processor_extcntl_ops = &xen_extcntl_ops; + + return 0; +} +arch_initcall(init_extcntl); + +unsigned int cpufreq_quick_get(unsigned int cpu) +{ + xen_platform_op_t op; + + op.cmd = XENPF_get_cpu_freq; + op.u.get_cpu_freq.vcpu = cpu; + return HYPERVISOR_platform_op(&op) == 0 ? op.u.get_cpu_freq.freq : 0; +} + +unsigned int cpufreq_quick_get_max(unsigned int cpu) +{ + xen_platform_op_t op; + + op.cmd = XENPF_get_cpu_freq_max; + op.u.get_cpu_freq.vcpu = cpu; + return HYPERVISOR_platform_op(&op) == 0 ? 
op.u.get_cpu_freq.freq : 0; +} +EXPORT_SYMBOL(cpufreq_quick_get_max); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index be16854..1bd8529 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -15,6 +15,10 @@ static u32 *flush_words; const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, +#ifdef CONFIG_XEN + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, /* Fam12, Fam14 */ +#endif { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, {} }; @@ -150,6 +154,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res) return res; } +#ifndef CONFIG_XEN int amd_get_subcaches(int cpu) { struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; @@ -204,6 +209,7 @@ int amd_set_subcaches(int cpu, int mask) return 0; } +#endif static int amd_cache_gart(void) { diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 0ae0323..f30b902 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -25,3 +25,7 @@ obj-$(CONFIG_X86_ES7000) += es7000_32.o # For 32bit, probe_32 need to be listed last obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o + +probe_64-$(CONFIG_XEN) := probe_32.o + +disabled-obj-$(CONFIG_XEN) := apic_%.o diff --git a/arch/x86/kernel/apic/apic-xen.c b/arch/x86/kernel/apic/apic-xen.c new file mode 100644 index 0000000..6b0603c --- /dev/null +++ b/arch/x86/kernel/apic/apic-xen.c @@ -0,0 +1,69 @@ +/* + * Local APIC handling stubs + */ + +#include +#include + +#include +#include +#include + +unsigned int num_processors; + +/* + * Debug level, exported for io_apic.c + */ +unsigned int apic_verbosity; + +/* Have we found an MP table */ +int smp_found_config; + +static int __init apic_set_verbosity(char *arg) +{ + if (!arg) { +#ifdef CONFIG_X86_64 + skip_ioapic_setup = 0; + return 0; +#endif + return -EINVAL; + } + + if (strcmp("debug", arg) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", arg) == 0) + apic_verbosity = APIC_VERBOSE; + else { + pr_warning("APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } + + return 0; +} +early_param("apic", apic_set_verbosity); + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +#ifndef CONFIG_SMP +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor(void) +{ +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +# ifdef CONFIG_X86_64 + else + nr_ioapics = 0; +# endif +#endif + + return 0; +} +#endif diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 31cb9ae..8773f2c 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -26,6 +26,10 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #endif #ifdef arch_trigger_all_cpu_backtrace +#ifdef CONFIG_XEN +#include +#endif + /* For reliability, we're prepared to waste bits here. 
*/ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; @@ -46,7 +50,11 @@ void arch_trigger_all_cpu_backtrace(void) cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); printk(KERN_INFO "sending NMI to all CPUs:\n"); +#ifndef CONFIG_XEN apic->send_IPI_all(NMI_VECTOR); +#else /* this works even without CONFIG_X86_LOCAL_APIC */ + xen_send_IPI_all(NMI_VECTOR); +#endif /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { diff --git a/arch/x86/kernel/apic/io_apic-xen.c b/arch/x86/kernel/apic/io_apic-xen.c new file mode 100644 index 0000000..2784be7 --- /dev/null +++ b/arch/x86/kernel/apic/io_apic-xen.c @@ -0,0 +1,4199 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku and + * Hidemi Kishimoto , + * further tested and cleaned up by Zach Brown + * and Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* time_after() */ +#include +#ifdef CONFIG_ACPI +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_XEN +#include +#include +#include + +/* Fake i8259 */ +static void make_8259A_irq(unsigned int irq) { io_apic_irqs &= ~(1UL<next) + +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; + +static DEFINE_RAW_SPINLOCK(ioapic_lock); +#ifndef CONFIG_XEN +static DEFINE_RAW_SPINLOCK(vector_lock); +#endif + +static struct ioapic { + /* + * # of IRQ routing registers + */ + int nr_registers; +#ifndef CONFIG_XEN + /* + * Saved state during suspend/resume, or while enabling intr-remap. 
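+	 * Not built under CONFIG_XEN, where the kernel does not save or
+	 * restore the physical RTEs itself; save_ioapic_entries(),
+	 * mask_ioapic_entries() and restore_ioapic_entries() further down
+	 * are compiled out for the same reason.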
+ */ + struct IO_APIC_route_entry *saved_registers; +#endif + /* I/O APIC config */ + struct mpc_ioapic mp_config; + /* IO APIC gsi routing info */ + struct mp_ioapic_gsi gsi_config; + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} ioapics[MAX_IO_APICS]; + +#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver + +int mpc_ioapic_id(int ioapic_idx) +{ + return ioapics[ioapic_idx].mp_config.apicid; +} + +unsigned int mpc_ioapic_addr(int ioapic_idx) +{ + return ioapics[ioapic_idx].mp_config.apicaddr; +} + +struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) +{ + return &ioapics[ioapic_idx].gsi_config; +} + +int nr_ioapics; + +/* The one past the highest gsi number used */ +u32 gsi_top; + +/* MP IRQ source entries */ +struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* # of MP IRQ source entries */ +int mp_irq_entries; + +#ifndef CONFIG_XEN +/* GSI interrupts */ +static int nr_irqs_gsi = NR_IRQS_LEGACY; +#endif + +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + +int skip_ioapic_setup; + +/** + * disable_ioapic_support() - disables ioapic support at runtime + */ +static void __init _disable_ioapic_support(void) +{ +#ifdef CONFIG_PCI + noioapicquirk = 1; + noioapicreroute = -1; +#endif + skip_ioapic_setup = 1; +} + +static int __init parse_noapic(char *str) +{ + /* disable IO-APIC */ + _disable_ioapic_support(); + return 0; +} +early_param("noapic", parse_noapic); + +static int io_apic_setup_irq_pin(unsigned int irq, int node, + struct io_apic_irq_attr *attr); + +/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ +void mp_save_irq(struct mpc_intsrc *m) +{ + int i; + + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); + + for (i = 0; i < mp_irq_entries; i++) { + if (!memcmp(&mp_irqs[i], m, sizeof(*m))) + return; + } + + memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m)); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +#ifndef CONFIG_XEN +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *alloc_irq_pin_list(int node) +{ + return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); +} + + +/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; + +int __init arch_early_irq_init(void) +{ + struct irq_cfg *cfg; + int count, node, i; + + if (!legacy_pic->nr_legacy_irqs) + io_apic_irqs = ~0UL; + + for (i = 0; i < nr_ioapics; i++) { + ioapics[i].saved_registers = + kzalloc(sizeof(struct IO_APIC_route_entry) * + ioapics[i].nr_registers, GFP_KERNEL); + if (!ioapics[i].saved_registers) + pr_err("IOAPIC %d: suspend/resume impossible!\n", i); + } + + cfg = irq_cfgx; + count = ARRAY_SIZE(irq_cfgx); + node = cpu_to_node(0); + + /* Make sure the legacy interrupts are marked in the bitmap */ + irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); + + for (i = 0; i < count; i++) { + irq_set_chip_data(i, &cfg[i]); + zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); + zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. 
+ */ + if (i < legacy_pic->nr_legacy_irqs) { + cfg[i].vector = IRQ0_VECTOR + i; + cpumask_set_cpu(0, cfg[i].domain); + } + } + + return 0; +} + +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq_get_chip_data(irq); +} + +static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) +{ + struct irq_cfg *cfg; + + cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); + if (!cfg) + return NULL; + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) + goto out_cfg; + if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) + goto out_domain; + return cfg; +out_domain: + free_cpumask_var(cfg->domain); +out_cfg: + kfree(cfg); + return NULL; +} + +static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) +{ + if (!cfg) + return; + irq_set_chip_data(at, NULL); + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); +} + +static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) +{ + int res = irq_alloc_desc_at(at, node); + struct irq_cfg *cfg; + + if (res < 0) { + if (res != -EEXIST) + return NULL; + cfg = irq_get_chip_data(at); + if (cfg) + return cfg; + } + + cfg = alloc_irq_cfg(at, node); + if (cfg) + irq_set_chip_data(at, cfg); + else + irq_free_desc(at); + return cfg; +} + +static int alloc_irq_from(unsigned int from, int node) +{ + return irq_alloc_desc_from(from, node); +} + +static void free_irq_at(unsigned int at, struct irq_cfg *cfg) +{ + free_irq_cfg(at, cfg); + irq_free_desc(at); +} + +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; + unsigned int unused2[11]; + unsigned int eoi; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mpc_ioapic_addr(idx) & ~PAGE_MASK); +} + +static inline void io_apic_eoi(unsigned int apic, unsigned int vector) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(vector, &io_apic->eoi); +} +#endif /* !CONFIG_XEN */ + +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ +#ifndef CONFIG_XEN + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +#else + struct physdev_apic apic_op; + int ret; + + apic_op.apic_physbase = mpc_ioapic_addr(apic); + apic_op.reg = reg; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op); + if (ret) + return ret; + return apic_op.value; +#endif +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ +#ifndef CONFIG_XEN + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +#else + struct physdev_apic apic_op; + + apic_op.apic_physbase = mpc_ioapic_addr(apic); + apic_op.reg = reg; + apic_op.value = value; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op)); +#endif +} + +#ifdef CONFIG_XEN +#define io_apic_modify io_apic_write +#else +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. 
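+ * (Not used in the Xen build: io_apic_modify is aliased to io_apic_write
+ * above, because every register access goes through the
+ * PHYSDEVOP_apic_read/PHYSDEVOP_apic_write hypercalls and there is no
+ * locally mapped index register to re-program.)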
+ * + * Older SiS APIC requires we rewrite the index register + */ +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + + if (sis_apic_bug) + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + int pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} +#endif /* CONFIG_XEN */ + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +#ifndef CONFIG_XEN +static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + return eu.entry; +} + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); + eu.entry = __ioapic_read_entry(apic, pin); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} +#endif + +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ +static void +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + union entry_union eu = {{0, 0}}; + + eu.entry = e; + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + raw_spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#ifndef CONFIG_XEN +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = 1 }; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. 
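+ *
+ * Each irq_cfg carries a singly linked irq_2_pin list of (apic, pin)
+ * pairs; __add_pin_to_irq_node() appends a pair unless it is already
+ * present.  For an ISA IRQ that is routed to two pins this ends up,
+ * roughly, as
+ *
+ *	cfg->irq_2_pin -> { apic 0, pin 0 } -> { apic 0, pin 2 } -> NULL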
+ */ +static int +__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + struct irq_pin_list **last, *entry; + + /* don't allow duplicates */ + last = &cfg->irq_2_pin; + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (entry->apic == apic && entry->pin == pin) + return 0; + last = &entry->next; + } + + entry = alloc_irq_pin_list(node); + if (!entry) { + printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); + return -ENOMEM; + } + entry->apic = apic; + entry->pin = pin; + + *last = entry; + return 0; +} + +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + if (__add_pin_to_irq_node(cfg, node, apic, pin)) + panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); +} + +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + /* every one is different, right? */ + return; + } + } + + /* old apic/pin didn't exist, so just add new ones */ + add_pin_to_irq_node(cfg, node, newapic, newpin); +} + +static void __io_apic_modify_irq(struct irq_pin_list *entry, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + unsigned int reg, pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); +} + +static void io_apic_modify_irq(struct irq_cfg *cfg, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) + __io_apic_modify_irq(entry, mask_and, mask_or, final); +} + +static void io_apic_sync(struct irq_pin_list *entry) +{ + /* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ + struct io_apic __iomem *io_apic; + io_apic = io_apic_base(entry->apic); + readl(&io_apic->data); +} + +static void mask_ioapic(struct irq_cfg *cfg) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void mask_ioapic_irq(struct irq_data *data) +{ + mask_ioapic(data->chip_data); +} + +static void __unmask_ioapic(struct irq_cfg *cfg) +{ + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); +} + +static void unmask_ioapic(struct irq_cfg *cfg) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + __unmask_ioapic(cfg); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_ioapic_irq(struct irq_data *data) +{ + unmask_ioapic(data->chip_data); +} + +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. 
+ * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. + */ +static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg) +{ + if (mpc_ioapic_ver(apic) >= 0x20) { + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + if (cfg && irq_remapped(cfg)) + io_apic_eoi(apic, pin); + else + io_apic_eoi(apic, vector); + } else { + struct IO_APIC_route_entry entry, entry1; + + entry = entry1 = __ioapic_read_entry(apic, pin); + + /* + * Mask the entry and change the trigger mode to edge. + */ + entry1.mask = 1; + entry1.trigger = IOAPIC_EDGE; + + __ioapic_write_entry(apic, pin, entry1); + + /* + * Restore the previous level triggered entry. + */ + __ioapic_write_entry(apic, pin, entry); + } +} + +static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) + __eoi_ioapic_pin(entry->apic, entry->pin, cfg->vector, cfg); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + entry = ioapic_read_entry(apic, pin); + if (entry.delivery_mode == dest_SMI) + return; + + /* + * Make sure the entry is masked and re-read the contents to check + * if it is a level triggered pin and if the remote-IRR is set. + */ + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + entry = ioapic_read_entry(apic, pin); + } + + if (entry.irr) { + unsigned long flags; + + /* + * Make sure the trigger mode is set to level. Explicit EOI + * doesn't clear the remote-IRR if the trigger mode is not + * set to level. + */ + if (!entry.trigger) { + entry.trigger = IOAPIC_LEVEL; + ioapic_write_entry(apic, pin, entry); + } + + raw_spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_pin(apic, pin, entry.vector, NULL); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + } + + /* + * Clear the rest of the bits in the IO-APIC RTE except for the mask + * bit. + */ + ioapic_mask_entry(apic, pin); + entry = ioapic_read_entry(apic, pin); + if (entry.irr) + printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n", + mpc_ioapic_id(apic), pin); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + clear_IO_APIC_pin(apic, pin); +} +#else +#define add_pin_to_irq_node(cfg, node, apic, pin) +#define __add_pin_to_irq_node(cfg, node, apic, pin) 0 +#endif /* !CONFIG_XEN */ + +#ifdef CONFIG_X86_32 +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries[MAX_PIRQS] = { + [0 ... MAX_PIRQS - 1] = -1 +}; + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... 
PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + +#ifndef CONFIG_XEN +/* + * Saves all the IO-APIC RTE's + */ +int save_ioapic_entries(void) +{ + int apic, pin; + int err = 0; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!ioapics[apic].saved_registers) { + err = -ENOMEM; + continue; + } + + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + ioapics[apic].saved_registers[pin] = + ioapic_read_entry(apic, pin); + } + + return err; +} + +/* + * Mask all IO APIC entries. + */ +void mask_ioapic_entries(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!ioapics[apic].saved_registers) + continue; + + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { + struct IO_APIC_route_entry entry; + + entry = ioapics[apic].saved_registers[pin]; + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + } +} + +/* + * Restore IO APIC entries which was saved in the ioapic structure. + */ +int restore_ioapic_entries(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!ioapics[apic].saved_registers) + continue; + + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + ioapic_write_entry(apic, pin, + ioapics[apic].saved_registers[pin]); + } + return 0; +} +#endif /* CONFIG_XEN */ + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int ioapic_idx, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].irqtype == type && + (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) || + mp_irqs[i].dstapic == MP_APIC_ALL) && + mp_irqs[i].dstirq == pin) + return i; + + return -1; +} + +#ifndef CONFIG_XEN +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].irqtype == type) && + (mp_irqs[i].srcbusirq == irq)) + + return mp_irqs[i].dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].irqtype == type) && + (mp_irqs[i].srcbusirq == irq)) + break; + } + + if (i < mp_irq_entries) { + int ioapic_idx; + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic) + return ioapic_idx; + } + + return -1; +} +#endif + +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < legacy_pic->nr_legacy_irqs) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +#endif + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. 
If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) +#define default_EISA_polarity(idx) default_ISA_polarity(idx) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) default_ISA_polarity(idx) + +static int irq_polarity(int idx) +{ + int bus = mp_irqs[idx].srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].irqflag & 3) + { + case 0: /* conforms, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); + break; + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int irq_trigger(int idx) +{ + int bus = mp_irqs[idx].srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } +#endif + break; + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq; + int bus = mp_irqs[idx].srcbus; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic); + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + if (test_bit(bus, mp_bus_not_pci)) { + irq = mp_irqs[idx].srcbusirq; + } else { + u32 gsi = gsi_cfg->gsi_base + pin; + + if (gsi >= NR_IRQS_LEGACY) + irq = gsi; + else + irq = gsi_top + gsi; + } + +#ifdef CONFIG_X86_32 + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. 
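+	 * pirq_entries[] is filled by ioapic_pirq_setup() above from the
+	 * "pirq=" boot option: an entry of 0 disables the pin, any other
+	 * value overrides the IRQ used for ioapic pins 16-23, so e.g.
+	 * "pirq=9,11" routes PIRQ0 to IRQ 9 and PIRQ1 to IRQ 11.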
+ */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } +#endif + + return irq; +} + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, + struct io_apic_irq_attr *irq_attr) +{ + int ioapic_idx, i, best_guess = -1; + + apic_printk(APIC_DEBUG, + "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, + "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic || + mp_irqs[i].dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq); + + if (!(ioapic_idx || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].srcbusirq & 3)) { + set_io_apic_irq_attr(irq_attr, ioapic_idx, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); + return irq; + } + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) { + set_io_apic_irq_attr(irq_attr, ioapic_idx, + mp_irqs[i].dstirq, + irq_trigger(i), + irq_polarity(i)); + best_guess = irq; + } + } + } + return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +#ifndef CONFIG_XEN +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + raw_spin_lock(&vector_lock); +} + +void unlock_vector_lock(void) +{ + raw_spin_unlock(&vector_lock); +} + +static int +__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +{ + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; + static int current_offset = VECTOR_OFFSET_START % 8; + unsigned int old_vector; + int cpu, err; + cpumask_var_t tmp_mask; + + if (cfg->move_in_progress) + return -EBUSY; + + if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) + return -ENOMEM; + + old_vector = cfg->vector; + if (old_vector) { + cpumask_and(tmp_mask, mask, cpu_online_mask); + cpumask_and(tmp_mask, cfg->domain, tmp_mask); + if (!cpumask_empty(tmp_mask)) { + free_cpumask_var(tmp_mask); + return 0; + } + } + + /* Only try and allocate irqs on cpus that are present */ + err = -ENOSPC; + for_each_cpu_and(cpu, mask, cpu_online_mask) { + int new_cpu; + int vector, offset; + + apic->vector_allocation_domain(cpu, tmp_mask); + + vector = current_vector; + offset = current_offset; +next: + vector += 8; + if (vector >= first_system_vector) { + /* If out of vectors on large boxen, must share them. 
*/ + offset = (offset + 1) % 8; + vector = FIRST_EXTERNAL_VECTOR + offset; + } + if (unlikely(current_vector == vector)) + continue; + + if (test_bit(vector, used_vectors)) + goto next; + +#ifdef CONFIG_KDB + if (vector == KDBENTER_VECTOR) + goto next; +#endif /* CONFIG_KDB */ + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + if (per_cpu(vector_irq, new_cpu)[vector] != -1) + goto next; + /* Found one! */ + current_vector = vector; + current_offset = offset; + if (old_vector) { + cfg->move_in_progress = 1; + cpumask_copy(cfg->old_domain, cfg->domain); + } + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cpumask_copy(cfg->domain, tmp_mask); + err = 0; + break; + } + free_cpumask_var(tmp_mask); + return err; +} + +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +{ + int err; + unsigned long flags; + + raw_spin_lock_irqsave(&vector_lock, flags); + err = __assign_irq_vector(irq, cfg, mask); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return err; +} + +static void __clear_irq_vector(int irq, struct irq_cfg *cfg) +{ + int cpu, vector; + + BUG_ON(!cfg->vector); + + vector = cfg->vector; + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + per_cpu(vector_irq, cpu)[vector] = -1; + + cfg->vector = 0; + cpumask_clear(cfg->domain); + + if (likely(!cfg->move_in_progress)) + return; + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; + vector++) { + if (per_cpu(vector_irq, cpu)[vector] != irq) + continue; + per_cpu(vector_irq, cpu)[vector] = -1; + break; + } + } + cfg->move_in_progress = 0; +} + +void __setup_vector_irq(int cpu) +{ + /* Initialize vector_irq on a new cpu */ + int irq, vector; + struct irq_cfg *cfg; + + /* + * vector_lock will make sure that we don't run into irq vector + * assignments that might be happening on another cpu in parallel, + * while we setup our initial vector to irq mappings. + */ + raw_spin_lock(&vector_lock); + /* Mark the inuse vectors */ + for_each_active_irq(irq) { + cfg = irq_get_chip_data(irq); + if (!cfg) + continue; + /* + * If it is a legacy IRQ handled by the legacy PIC, this cpu + * will be part of the irq_cfg's domain. 
+ */ + if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) + cpumask_set_cpu(cpu, cfg->domain); + + if (!cpumask_test_cpu(cpu, cfg->domain)) + continue; + vector = cfg->vector; + per_cpu(vector_irq, cpu)[vector] = irq; + } + /* Mark the free vectors */ + for (vector = 0; vector < NR_VECTORS; ++vector) { + irq = per_cpu(vector_irq, cpu)[vector]; + if (irq < 0) + continue; + + cfg = irq_cfg(irq); + if (!cpumask_test_cpu(cpu, cfg->domain)) + per_cpu(vector_irq, cpu)[vector] = -1; + } + raw_spin_unlock(&vector_lock); +} + +static struct irq_chip ioapic_chip; + +#ifdef CONFIG_X86_32 +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} +#else +static inline int IO_APIC_irq_trigger(int irq) +{ + return 1; +} +#endif + +static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, + unsigned long trigger) +{ + struct irq_chip *chip = &ioapic_chip; + irq_flow_handler_t hdl; + bool fasteoi; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) { + irq_set_status_flags(irq, IRQ_LEVEL); + fasteoi = true; + } else { + irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } + + if (irq_remapped(cfg)) { + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + irq_remap_modify_chip_defaults(chip); + fasteoi = trigger != 0; + } + + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + irq_set_chip_and_handler_name(irq, chip, hdl, + fasteoi ? "fasteoi" : "edge"); +} + + +static int setup_ir_ioapic_entry(int irq, + struct IR_IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) +{ + int index; + struct irte irte; + int ioapic_id = mpc_ioapic_id(attr->ioapic); + struct intel_iommu *iommu = map_ioapic_to_ir(ioapic_id); + + if (!iommu) { + pr_warn("No mapping iommu for ioapic %d\n", ioapic_id); + return -ENODEV; + } + + index = alloc_irte(iommu, irq, 1); + if (index < 0) { + pr_warn("Failed to allocate IRTE for ioapic %d\n", ioapic_id); + return -ENOMEM; + } + + prepare_irte(&irte, vector, destination); + + /* Set source-id of interrupt request */ + set_ioapic_sid(&irte, ioapic_id); + + modify_irte(irq, &irte); + + apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: " + "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d " + "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X " + "Avail:%X Vector:%02X Dest:%08X " + "SID:%04X SQ:%X SVT:%X)\n", + attr->ioapic, irte.present, irte.fpd, irte.dst_mode, + irte.redir_hint, irte.trigger_mode, irte.dlvry_mode, + irte.avail, irte.vector, irte.dest_id, + irte.sid, irte.sq, irte.svt); + + memset(entry, 0, sizeof(*entry)); + + entry->index2 = (index >> 15) & 0x1; + entry->zero = 0; + entry->format = 1; + entry->index = (index & 0x7fff); + /* + * IO-APIC RTE will be configured with virtual vector. + * irq handler will do the explicit EOI to the io-apic. + */ + entry->vector = attr->ioapic_pin; + entry->mask = 0; /* enable IRQ */ + entry->trigger = attr->trigger; + entry->polarity = attr->polarity; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 
+ */ + if (attr->trigger) + entry->mask = 1; + + return 0; +} +#else /* !CONFIG_XEN */ +#define __clear_irq_vector(irq, cfg) ((void)0) +#define ioapic_register_intr(irq, cfg, trigger) evtchn_register_pirq(irq) +#endif + +static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) +{ +#ifndef CONFIG_XEN + if (intr_remapping_enabled) + return setup_ir_ioapic_entry(irq, + (struct IR_IO_APIC_route_entry *)entry, + destination, vector, attr); +#endif + + memset(entry, 0, sizeof(*entry)); + + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->dest = destination; + entry->vector = vector; + entry->mask = 0; /* enable IRQ */ + entry->trigger = attr->trigger; + entry->polarity = attr->polarity; + + /* + * Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (attr->trigger) + entry->mask = 1; + + return 0; +} + +static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, + struct io_apic_irq_attr *attr) +{ + struct IO_APIC_route_entry entry; + unsigned int dest; + + if (!IO_APIC_IRQ(irq)) + return; +#ifndef CONFIG_XEN + /* + * For legacy irqs, cfg->domain starts with cpu 0 for legacy + * controllers like 8259. Now that IO-APIC can handle this irq, update + * the cfg->domain. + */ + if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) + apic->vector_allocation_domain(0, cfg->domain); +#else + /* + * For legacy IRQs we may get here before trigger mode and polarity + * get obtained, but Xen refuses to set those through + * PHYSDEVOP_setup_gsi more than once (perhaps even at all). + */ + if (irq >= legacy_pic->nr_legacy_irqs + || test_bit(attr->ioapic_pin, + ioapics[attr->ioapic].pin_programmed)) { + struct physdev_setup_gsi setup_gsi = { + .gsi = irq, + .triggering = attr->trigger, + .polarity = attr->polarity + }; + struct physdev_map_pirq map_pirq = { + .domid = DOMID_SELF, + .type = MAP_PIRQ_TYPE_GSI, + .index = irq, + .pirq = irq + }; + + switch (HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, + &setup_gsi)) { + case -EEXIST: + if (irq < legacy_pic->nr_legacy_irqs) + break; + /* fall through */ + case 0: + evtchn_register_pirq(irq); + if (HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, + &map_pirq) == 0) { + /* fake (for init_IO_APIC_traps()): */ + cfg->vector = irq; + return; + } + } + } +#endif + + if (assign_irq_vector(irq, cfg, apic->target_cpus())) + return; + +#ifndef CONFIG_XEN + dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); +#else + dest = 0; /* meaningless */ +#endif + + apic_printk(APIC_VERBOSE,KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " + "IRQ %d Mode:%i Active:%i Dest:%d)\n", + attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, + cfg->vector, irq, attr->trigger, attr->polarity, dest); + + if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { + pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", + mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); + __clear_irq_vector(irq, cfg); + + return; + } + + ioapic_register_intr(irq, cfg, attr->trigger); +#ifndef CONFIG_XEN + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->mask(irq); +#endif + + ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); +} + +static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin) +{ + if (idx != -1) + return false; + + apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", + mpc_ioapic_id(ioapic_idx), pin); + 
return true; +} + +static void __init __io_apic_setup_irqs(unsigned int ioapic_idx) +{ + int idx, node = cpu_to_node(0); + struct io_apic_irq_attr attr; + unsigned int pin, irq; + + for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) { + idx = find_irq_entry(ioapic_idx, pin, mp_INT); + if (io_apic_pin_not_connected(idx, ioapic_idx, pin)) + continue; + + irq = pin_2_irq(idx, ioapic_idx, pin); + + if ((ioapic_idx > 0) && (irq > 16)) + continue; + +#ifdef CONFIG_XEN + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) + continue; +#else + /* + * Skip the timer IRQ if there's a quirk handler + * installed and if it returns 1: + */ + if (apic->multi_timer_check && + apic->multi_timer_check(ioapic_idx, irq)) + continue; +#endif + + set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), + irq_polarity(idx)); + + io_apic_setup_irq_pin(irq, node, &attr); + } +} + +static void __init setup_IO_APIC_irqs(void) +{ + unsigned int ioapic_idx; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + __io_apic_setup_irqs(ioapic_idx); +} + +/* + * for the gsit that is not in first ioapic + * but could not use acpi_register_gsi() + * like some special sci in IBM x3330 + */ +void setup_IO_APIC_irq_extra(u32 gsi) +{ + int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0); + struct io_apic_irq_attr attr; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic_idx = mp_find_ioapic(gsi); + if (ioapic_idx < 0) + return; + + pin = mp_find_ioapic_pin(ioapic_idx, gsi); + idx = find_irq_entry(ioapic_idx, pin, mp_INT); + if (idx == -1) + return; + + irq = pin_2_irq(idx, ioapic_idx, pin); +#ifdef CONFIG_XEN + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) + return; +#endif + + /* Only handle the non legacy irqs on secondary ioapics */ + if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY) + return; + + set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), + irq_polarity(idx)); + + io_apic_setup_irq_pin_once(irq, node, &attr); +} + +#ifndef CONFIG_XEN +/* + * Set up the timer pin, possibly with the 8259A-master behind. + */ +static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, + unsigned int pin, int vector) +{ + struct IO_APIC_route_entry entry; + + if (intr_remapping_enabled) + return; + + memset(&entry, 0, sizeof(entry)); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = apic->irq_dest_mode; + entry.mask = 0; /* don't mask IRQ for edge */ + entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); + entry.delivery_mode = apic->irq_delivery_mode; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we may have a 8259A-master in AEOI mode ... 
+ */ + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(ioapic_idx, pin, entry); +} + +__apicdebuginit(void) print_IO_APIC(int ioapic_idx) +{ + int i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + reg_01.raw = io_apic_read(ioapic_idx, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(ioapic_idx, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(ioapic_idx, 3); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + printk("\n"); + printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... : max redirection entries: %02X\n", + reg_01.bits.entries); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %02X\n", + reg_01.bits.version); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + + printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); + + if (intr_remapping_enabled) { + printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR" + " Pol Stat Indx2 Zero Vect:\n"); + } else { + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect:\n"); + } + + for (i = 0; i <= reg_01.bits.entries; i++) { + if (intr_remapping_enabled) { + struct IO_APIC_route_entry entry; + struct IR_IO_APIC_route_entry *ir_entry; + + entry = ioapic_read_entry(ioapic_idx, i); + ir_entry = (struct IR_IO_APIC_route_entry *) &entry; + printk(KERN_DEBUG " %02x %04X ", + i, + ir_entry->index + ); + printk("%1d %1d %1d %1d %1d " + "%1d %1d %X %02X\n", + ir_entry->format, + ir_entry->mask, + ir_entry->trigger, + ir_entry->irr, + ir_entry->polarity, + ir_entry->delivery_status, + ir_entry->index2, + ir_entry->zero, + ir_entry->vector + ); + } else { + struct IO_APIC_route_entry entry; + + entry = ioapic_read_entry(ioapic_idx, i); + printk(KERN_DEBUG " %02x %02X ", + i, + entry.dest + ); + printk("%1d %1d %1d %1d %1d " + "%1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } +} + +__apicdebuginit(void) print_IO_APICs(void) +{ + int ioapic_idx; + struct irq_cfg *cfg; + unsigned int irq; + struct irq_chip *chip; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mpc_ioapic_id(ioapic_idx), + ioapics[ioapic_idx].nr_registers); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + print_IO_APIC(ioapic_idx); + + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for_each_active_irq(irq) { + struct irq_pin_list *entry; + + chip = irq_get_chip(irq); + if (chip != &ioapic_chip) + continue; + + cfg = irq_get_chip_data(irq); + if (!cfg) + continue; + entry = cfg->irq_2_pin; + if (!entry) + continue; + printk(KERN_DEBUG "IRQ%d ", irq); + for_each_irq_pin(entry, cfg->irq_2_pin) + printk("-> %d:%d", entry->apic, entry->pin); + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); +} + +__apicdebuginit(void) print_APIC_field(int base) +{ + int i; + + printk(KERN_DEBUG); + + for (i = 0; i < 8; i++) + printk(KERN_CONT "%08x", apic_read(base + i*0x10)); + + printk(KERN_CONT "\n"); +} + +__apicdebuginit(void) print_local_APIC(void *dummy) +{ + unsigned int i, v, ver, maxlvt; + u64 icr; + + printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); + v = apic_read(APIC_LVR); + printk(KERN_INFO "... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = lapic_get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (!APIC_XAPIC(ver)) { + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + } + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + } + + /* + * Remote read supported only in the 82489DX and local APIC for + * Pentium processors. 
+ */ + if (!APIC_INTEGRATED(ver) || maxlvt == 3) { + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + } + + v = apic_read(APIC_LDR); + printk(KERN_DEBUG "... APIC LDR: %08x\n", v); + if (!x2apic_enabled()) { + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + } + v = apic_read(APIC_SPIV); + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); + + printk(KERN_DEBUG "... APIC ISR field:\n"); + print_APIC_field(APIC_ISR); + printk(KERN_DEBUG "... APIC TMR field:\n"); + print_APIC_field(APIC_TMR); + printk(KERN_DEBUG "... APIC IRR field:\n"); + print_APIC_field(APIC_IRR); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + icr = apic_icr_read(); + printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); + } + } + printk("\n"); +} + +__apicdebuginit(void) print_local_APICs(int maxcpu) +{ + int cpu; + + if (!maxcpu) + return; + + preempt_disable(); + for_each_online_cpu(cpu) { + if (cpu >= maxcpu) + break; + smp_call_function_single(cpu, print_local_APIC, NULL, 1); + } + preempt_enable(); +} + +__apicdebuginit(void) print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (!legacy_pic->nr_legacy_irqs) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... 
PIC ELCR: %04x\n", v); +} + +static int __initdata show_lapic = 1; +static __init int setup_show_lapic(char *arg) +{ + int num = -1; + + if (strcmp(arg, "all") == 0) { + show_lapic = CONFIG_NR_CPUS; + } else { + get_option(&arg, &num); + if (num >= 0) + show_lapic = num; + } + + return 1; +} +__setup("show_lapic=", setup_show_lapic); + +__apicdebuginit(int) print_ICs(void) +{ + if (apic_verbosity == APIC_QUIET) + return 0; + + print_PIC(); + + /* don't print out if apic is not there */ + if (!cpu_has_apic && !apic_from_smp_config()) + return 0; + + print_local_APICs(show_lapic); + print_IO_APICs(); + + return 0; +} + +late_initcall(print_ICs); + + +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + +void __init enable_IO_APIC(void) +{ + int i8259_apic, i8259_pin; + int apic; + + if (!legacy_pic->nr_legacy_irqs) + return; + + for(apic = 0; apic < nr_ioapics; apic++) { + int pin; + /* See if any of the pins is in ExtINT mode */ + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { + struct IO_APIC_route_entry entry; + entry = ioapic_read_entry(apic, pin); + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + + if (!legacy_pic->nr_legacy_irqs) + return; + + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. + * + * With interrupt-remapping, for now we will use virtual wire A mode, + * as virtual wire B is little complex (need to configure both + * IOAPIC RTE as well as interrupt-remapping table entry). + * As this gets called during crash dump, keep this simple for now. 
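+ *
+ * The ExtINT entry written below amounts to (sketch of the assignments
+ * that follow, not an additional configuration step): unmasked, edge
+ * triggered, active high, physical destination mode, delivery_mode =
+ * dest_ExtINT, vector = 0, dest = read_apic_id(), i.e. the boot CPU.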
+ */ + if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { + struct IO_APIC_route_entry entry; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = dest_ExtINT; /* ExtInt */ + entry.vector = 0; + entry.dest = read_apic_id(); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); + } + + /* + * Use virtual wire A mode when interrupt remapping is enabled. + */ + if (cpu_has_apic || apic_from_smp_config()) + disconnect_bsp_APIC(!intr_remapping_enabled && + ioapic_i8259.pin != -1); +} + +#ifdef CONFIG_X86_32 +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 + */ +void __init setup_ioapic_ids_from_mpc_nocheck(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int ioapic_idx; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) { + /* Read the register 0 value */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mpc_ioapic_id(ioapic_idx); + + if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (apic->check_apicid_used(&phys_id_present_map, + mpc_ioapic_id(ioapic_idx))) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + ioapics[ioapic_idx].mp_config.apicid = i; + } else { + physid_mask_t tmp; + apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx), + &tmp); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mpc_ioapic_id(ioapic_idx)); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mpc_ioapic_id(ioapic_idx)) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].dstapic == old_id) + mp_irqs[i].dstapic + = mpc_ioapic_id(ioapic_idx); + + /* + * Update the ID register according to the right value + * from the MPC table if they are different. 
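+ *
+ * In sketch form this is a plain write-then-verify of register 0:
+ *
+ *	reg_00.bits.ID = mpc_ioapic_id(ioapic_idx);
+ *	io_apic_write(ioapic_idx, 0, reg_00.raw);
+ *	... re-read register 0 and warn if the new ID did not stick ...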
+ */ + if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID) + continue; + + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mpc_ioapic_id(ioapic_idx)); + + reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic_idx, 0, reg_00.raw); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} + +void __init setup_ioapic_ids_from_mpc(void) +{ + + if (acpi_ioapic) + return; + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + setup_ioapic_ids_from_mpc_nocheck(); +} +#endif + +int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + unsigned long flags; + + if (no_timer_check) + return 1; + + local_save_flags(flags); + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + local_irq_restore(flags); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + + /* jiffies wrap? */ + if (time_after(jiffies, t1 + 4)) + return 1; + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... 
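+ *
+ * The sequence below is, in outline: mask the 8259A line (legacy IRQs
+ * only), note whether it was already pending there, unmask the
+ * IO-APIC entry, and return that pending state.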
+ */ + +static unsigned int startup_ioapic_irq(struct irq_data *data) +{ + int was_pending = 0, irq = data->irq; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + if (irq < legacy_pic->nr_legacy_irqs) { + legacy_pic->mask(irq); + if (legacy_pic->irq_pending(irq)) + was_pending = 1; + } + __unmask_ioapic(data->chip_data); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +static int ioapic_retrigger_irq(struct irq_data *data) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned long flags; + + raw_spin_lock_irqsave(&vector_lock, flags); + apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); + raw_spin_unlock_irqrestore(&vector_lock, flags); + + return 1; +} + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ + +#ifdef CONFIG_SMP +void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(cfg)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + } +} + +/* + * Either sets data->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and + * leaves data->affinity untouched. + */ +int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + unsigned int *dest_id) +{ + struct irq_cfg *cfg = data->chip_data; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -1; + + if (assign_irq_vector(data->irq, data->chip_data, mask)) + return -1; + + cpumask_copy(data->affinity, mask); + + *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain); + return 0; +} + +static int +ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + unsigned int dest, irq = data->irq; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + ret = __ioapic_set_affinity(data, mask, &dest); + if (!ret) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, data->chip_data); + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return ret; +} + +#ifdef CONFIG_IRQ_REMAP + +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. 
+ * + * For both level and edge triggered, irq migration is a simple atomic + * update(of vector and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we eliminate the io-apic RTE modification (with the + * updated vector information), by using a virtual vector (io-apic pin number). + * Real vector that is used for interrupting cpu will be coming from + * the interrupt-remapping table entry. + * + * As the migration is a simple atomic update of IRTE, the same mechanism + * is used to migrate MSI irq's in the presence of interrupt-remapping. + */ +static int +ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; + struct irte irte; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -EINVAL; + + if (get_irte(irq, &irte)) + return -EBUSY; + + if (assign_irq_vector(irq, cfg, mask)) + return -EBUSY; + + dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * Atomically updates the IRTE with the new destination, vector + * and flushes the interrupt entry cache. + */ + modify_irte(irq, &irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + cpumask_copy(data->affinity, mask); + return 0; +} + +#else +static inline int +ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + return 0; +} +#endif + +asmlinkage void smp_irq_move_cleanup_interrupt(void) +{ + unsigned vector, me; + + ack_APIC_irq(); + irq_enter(); + exit_idle(); + + me = smp_processor_id(); + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irq; + unsigned int irr; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __this_cpu_read(vector_irq[vector]); + + if (irq == -1) + continue; + + desc = irq_to_desc(irq); + if (!desc) + continue; + + cfg = irq_cfg(irq); + raw_spin_lock(&desc->lock); + + /* + * Check if the irq migration is in progress. If so, we + * haven't received the cleanup request yet for this irq. + */ + if (cfg->move_in_progress) + goto unlock; + + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + goto unlock; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + /* + * Check if the vector that needs to be cleanedup is + * registered at the cpu's IRR. If so, then this is not + * the best time to clean it up. Lets clean it up in the + * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR + * to myself. 
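+ *
+ * (The IRR is banked as 32-bit words spaced 0x10 apart, so the pending
+ *  bit for this vector is, in sketch form,
+ *	apic_read(APIC_IRR + (vector / 32) * 0x10) & (1 << (vector % 32))
+ *  which is what the read above and the test below combine to.)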
+ */ + if (irr & (1 << (vector % 32))) { + apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + goto unlock; + } + __this_cpu_write(vector_irq[vector], -1); +unlock: + raw_spin_unlock(&desc->lock); + } + + irq_exit(); +} + +static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) +{ + unsigned me; + + if (likely(!cfg->move_in_progress)) + return; + + me = smp_processor_id(); + + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + send_cleanup_vector(cfg); +} + +static void irq_complete_move(struct irq_cfg *cfg) +{ + __irq_complete_move(cfg, ~get_irq_regs()->orig_ax); +} + +void irq_force_complete_move(int irq) +{ + struct irq_cfg *cfg = irq_get_chip_data(irq); + + if (!cfg) + return; + + __irq_complete_move(cfg, cfg->vector); +} +#else +static inline void irq_complete_move(struct irq_cfg *cfg) { } +#endif + +static void ack_apic_edge(struct irq_data *data) +{ + irq_complete_move(data->chip_data); + irq_move_irq(data); + ack_APIC_irq(); +} + +atomic_t irq_mis_count; + +static void ack_apic_level(struct irq_data *data) +{ + struct irq_cfg *cfg = data->chip_data; + int i, do_unmask_irq = 0, irq = data->irq; + unsigned long v; + + irq_complete_move(cfg); +#ifdef CONFIG_GENERIC_PENDING_IRQ + /* If we are moving the irq we need to mask it */ + if (unlikely(irqd_is_setaffinity_pending(data))) { + do_unmask_irq = 1; + mask_ioapic(cfg); + } +#endif + + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + * + * Also in the case when cpu goes offline, fixup_irqs() will forward + * any unhandled interrupt on the offlined cpu to the new cpu + * destination that is handling the corresponding interrupt. This + * interrupt forwarding is done via IPI's. Hence, in this case also + * level-triggered io-apic interrupt will be seen as an edge + * interrupt in the IRR. And we can't rely on the cpu's EOI + * to be broadcasted to the IO-APIC's which will clear the remoteIRR + * corresponding to the level-triggered interrupt. Hence on IO-APIC's + * supporting EOI register, we do an explicit EOI to clear the + * remote IRR and on IO-APIC's which don't have an EOI register, + * we use the above logic (mask+edge followed by unmask+level) from + * Manfred Spraul to clear the remote IRR. + */ + i = cfg->vector; + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. 
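+ *
+ * (The TMR word for this vector was latched into 'v' above for the same
+ *  ordering reason; like the IRR it is banked in 32-bit words 0x10
+ *  apart, so (i & ~0x1f) >> 1 is just (i / 32) * 0x10, and bit
+ *  (i & 0x1f) records whether the interrupt was seen as level
+ *  triggered.)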
+ */ + ack_APIC_irq(); + + /* + * Tail end of clearing remote IRR bit (either by delivering the EOI + * message via io-apic EOI register write or simulating it using + * mask+edge followed by unnask+level logic) manually when the + * level triggered interrupt is seen as the edge triggered interrupt + * at the cpu. + */ + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + + eoi_ioapic_irq(irq, cfg); + } + + /* Now we can move and renable the irq */ + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(cfg)) + irq_move_masked_irq(data); + unmask_ioapic(cfg); + } +} + +#ifdef CONFIG_IRQ_REMAP +static void ir_ack_apic_edge(struct irq_data *data) +{ + ack_APIC_irq(); +} + +static void ir_ack_apic_level(struct irq_data *data) +{ + ack_APIC_irq(); + eoi_ioapic_irq(data->irq, data->chip_data); +} + +static void ir_print_prefix(struct irq_data *data, struct seq_file *p) +{ + seq_printf(p, " IR-%s", data->chip->name); +} + +static void irq_remap_modify_chip_defaults(struct irq_chip *chip) +{ + chip->irq_print_chip = ir_print_prefix; + chip->irq_ack = ir_ack_apic_edge; + chip->irq_eoi = ir_ack_apic_level; + +#ifdef CONFIG_SMP + chip->irq_set_affinity = ir_ioapic_set_affinity; +#endif +} +#endif /* CONFIG_IRQ_REMAP */ + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = ack_apic_edge, + .irq_eoi = ack_apic_level, +#ifdef CONFIG_SMP + .irq_set_affinity = ioapic_set_affinity, +#endif + .irq_retrigger = ioapic_retrigger_irq, +}; +#endif /* !CONFIG_XEN */ + +static inline void init_IO_APIC_traps(void) +{ + struct irq_cfg *cfg; + unsigned int irq; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for_each_active_irq(irq) { +#ifdef CONFIG_XEN + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) + continue; +#endif + cfg = irq_get_chip_data(irq); + if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. 
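+ *
+ * i.e., in sketch form:
+ *
+ *	irq < legacy_pic->nr_legacy_irqs
+ *		? hand the line back to the 8259A via legacy_pic->make_irq()
+ *		: attach no_irq_chip so stray requests are refused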
+ */ + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->make_irq(irq); + else + /* Strange. Oh, well.. */ + irq_set_chip(irq, &no_irq_chip); + } + } +} + +#ifndef CONFIG_XEN +/* + * The local APIC irq-chip implementation: + */ + +static void mask_lapic_irq(struct irq_data *data) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void unmask_lapic_irq(struct irq_data *data) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void ack_lapic_irq(struct irq_data *data) +{ + ack_APIC_irq(); +} + +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC", + .irq_mask = mask_lapic_irq, + .irq_unmask = unmask_lapic_irq, + .irq_ack = ack_lapic_irq, +}; + +static void lapic_register_intr(int irq) +{ + irq_clear_status_flags(irq, IRQ_LEVEL); + irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + "edge"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. --macro + */ +static inline void __init unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + + pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } + apic = find_isa_irq_apic(8, mp_INT); + if (apic == -1) { + WARN_ON_ONCE(1); + return; + } + + entry0 = ioapic_read_entry(apic, pin); + clear_IO_APIC_pin(apic, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + ioapic_write_entry(apic, pin, entry1); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + ioapic_write_entry(apic, pin, entry0); +} + +static int disable_timer_pin_1 __initdata; +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", disable_timer_pin_setup); + +int timer_through_8259 __initdata; + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for all platforms. 
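+ *
+ * The fallback ladder tried below, in outline:
+ *
+ *	1) IRQ0 through the IO-APIC pin the MP table reports for the timer
+ *	2) IRQ0 through the IO-APIC pin of the cascaded 8259A
+ *	3) the timer as a local APIC virtual wire (LVT0, fixed vector)
+ *	4) the timer as an ExtINT IRQ via the 8259A
+ *
+ * and only if all four fail does it panic.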
+ */ +static inline void __init check_timer(void) +{ + struct irq_cfg *cfg = irq_get_chip_data(0); + int node = cpu_to_node(0); + int apic1, pin1, apic2, pin2; + unsigned long flags; + int no_pin1 = 0; + + local_irq_save(flags); + + /* + * get/set the timer IRQ vector: + */ + legacy_pic->mask(0); + assign_irq_vector(0, cfg, apic->target_cpus()); + + /* + * As IRQ0 is to be enabled in the 8259A, the virtual + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. + */ + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + legacy_pic->init(1); + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); + + /* + * Some BIOS writers are clueless and report the ExtINTA + * I/O APIC input from the cascaded 8259A as the timer + * interrupt input. So just in case, if only one pin + * was found above, try it both directly and through the + * 8259A. + */ + if (pin1 == -1) { + if (intr_remapping_enabled) + panic("BIOS bug: timer not connected to IO-APIC"); + pin1 = pin2; + apic1 = apic2; + no_pin1 = 1; + } else if (pin2 == -1) { + pin2 = pin1; + apic2 = apic1; + } + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + if (no_pin1) { + add_pin_to_irq_node(cfg, node, apic1, pin1); + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + } else { + /* for edge trigger, setup_ioapic_irq already + * leave it unmasked. + * so only need to unmask if it is level-trigger + * do we really have level trigger timer? + */ + int idx; + idx = find_irq_entry(apic1, pin1, mp_INT); + if (idx != -1 && irq_trigger(idx)) + unmask_ioapic(cfg); + } + if (timer_irq_works()) { + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + goto out; + } + if (intr_remapping_enabled) + panic("timer doesn't work through Interrupt-remapped IO-APIC"); + local_irq_disable(); + clear_IO_APIC_pin(apic1, pin1); + if (!no_pin1) + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); + + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + legacy_pic->unmask(0); + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); + timer_through_8259 = 1; + goto out; + } + /* + * Cleanup, just in case ... + */ + local_irq_disable(); + legacy_pic->mask(0); + clear_IO_APIC_pin(apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); + } + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); + + lapic_register_intr(0); + apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ + legacy_pic->unmask(0); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... 
works.\n"); + goto out; + } + local_irq_disable(); + legacy_pic->mask(0); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); + + legacy_pic->init(0); + legacy_pic->make_irq(0); + apic_write(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + local_irq_disable(); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + if (x2apic_preenabled) + apic_printk(APIC_QUIET, KERN_INFO + "Perhaps problem with the pre-enabled x2apic mode\n" + "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option.\n"); +out: + local_irq_restore(flags); +} +#else +#define check_timer() ((void)0) +#endif + +/* + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available + * to devices. However there may be an I/O APIC pin available for + * this interrupt regardless. The pin may be left unconnected, but + * typically it will be reused as an ExtINT cascade interrupt for + * the master 8259A. In the MPS case such a pin will normally be + * reported as an ExtINT interrupt in the MP table. With ACPI + * there is no provision for ExtINT interrupts, and in the absence + * of an override it would be treated as an ordinary ISA I/O APIC + * interrupt, that is edge-triggered and unmasked by default. We + * used to do this, but it caused problems on some systems because + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using + * the same ExtINT cascade interrupt to drive the local APIC of the + * bootstrap processor. Therefore we refrain from routing IRQ2 to + * the I/O APIC in all cases now. No actual device should request + * it anyway. --macro + */ +#define PIC_IRQS (1UL << PIC_CASCADE_IR) + +void __init setup_IO_APIC(void) +{ + + /* + * calling enable_IO_APIC() is moved to setup_local_APIC for BP + */ + io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL; + + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + /* + * Set up IO-APIC IRQ routing. + */ +#ifndef CONFIG_XEN + x86_init.mpparse.setup_ioapic_ids(); + + sync_Arb_IDs(); +#endif + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + if (legacy_pic->nr_legacy_irqs) + check_timer(); +} + +/* + * Called after all the initialization is done. If we didn't find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if (sis_apic_bug == -1) + sis_apic_bug = 0; +#ifdef CONFIG_X86_XEN + if (is_initial_xendomain()) { + struct xen_platform_op op = { .cmd = XENPF_platform_quirk }; + op.u.platform_quirk.quirk_id = sis_apic_bug ? 
+ QUIRK_IOAPIC_BAD_REGSEL : QUIRK_IOAPIC_GOOD_REGSEL; + VOID(HYPERVISOR_platform_op(&op)); + } +#endif + return 0; +} + +late_initcall(io_apic_bug_finalize); + +#ifndef CONFIG_XEN +static void resume_ioapic_id(int ioapic_idx) +{ + unsigned long flags; + union IO_APIC_reg_00 reg_00; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) { + reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); + io_apic_write(ioapic_idx, 0, reg_00.raw); + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void ioapic_resume(void) +{ + int ioapic_idx; + + for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--) + resume_ioapic_id(ioapic_idx); + + restore_ioapic_entries(); +} + +static struct syscore_ops ioapic_syscore_ops = { + .suspend = save_ioapic_entries, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_ops(void) +{ + register_syscore_ops(&ioapic_syscore_ops); + + return 0; +} + +device_initcall(ioapic_init_ops); + +/* + * Dynamic irq allocate and deallocation + */ +unsigned int create_irq_nr(unsigned int from, int node) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int ret = 0; + int irq; + + if (from < nr_irqs_gsi) + from = nr_irqs_gsi; + + irq = alloc_irq_from(from, node); + if (irq < 0) + return 0; + cfg = alloc_irq_cfg(irq, node); + if (!cfg) { + free_irq_at(irq, NULL); + return 0; + } + + raw_spin_lock_irqsave(&vector_lock, flags); + if (!__assign_irq_vector(irq, cfg, apic->target_cpus())) + ret = irq; + raw_spin_unlock_irqrestore(&vector_lock, flags); + + if (ret) { + irq_set_chip_data(irq, cfg); + irq_clear_status_flags(irq, IRQ_NOREQUEST); + } else { + free_irq_at(irq, cfg); + } + return ret; +} + +int create_irq(void) +{ + int node = cpu_to_node(0); + unsigned int irq_want; + int irq; + + irq_want = nr_irqs_gsi; + irq = create_irq_nr(irq_want, node); + + if (irq == 0) + irq = -1; + + return irq; +} + +void destroy_irq(unsigned int irq) +{ + struct irq_cfg *cfg = irq_get_chip_data(irq); + unsigned long flags; + + irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); + + if (irq_remapped(cfg)) + free_irte(irq); + raw_spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq, cfg); + raw_spin_unlock_irqrestore(&vector_lock, flags); + free_irq_at(irq, cfg); +} +#endif /* !CONFIG_XEN */ + +/* + * MSI message composition + */ +#if defined(CONFIG_PCI_MSI) && !defined(CONFIG_XEN) +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, + struct msi_msg *msg, u8 hpet_id) +{ + struct irq_cfg *cfg; + int err; + unsigned dest; + + if (disable_apic) + return -ENXIO; + + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, apic->target_cpus()); + if (err) + return err; + + dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); + + if (irq_remapped(cfg)) { + struct irte irte; + int ir_index; + u16 sub_handle; + + ir_index = map_irq_to_irte_handle(irq, &sub_handle); + BUG_ON(ir_index == -1); + + prepare_irte(&irte, cfg->vector, dest); + + /* Set source-id of interrupt request */ + if (pdev) + set_msi_sid(&irte, pdev); + else + set_hpet_sid(&irte, hpet_id); + + modify_irte(irq, &irte); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(ir_index) | + MSI_ADDR_IR_INDEX2(ir_index); + } else { + if (x2apic_enabled()) + msg->address_hi = MSI_ADDR_BASE_HI | + MSI_ADDR_EXT_DEST_ID(dest); + else + msg->address_hi = MSI_ADDR_BASE_HI; + + 
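+		/*
+		 * Sketch of the non-remapped encoding being composed here:
+		 * the destination APIC ID travels in the address word
+		 * (0xFEE00000 | dest << 12, plus the mode bits below) and
+		 * the vector and delivery mode travel in the data word.
+		 */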
msg->address_lo = + MSI_ADDR_BASE_LO | + ((apic->irq_dest_mode == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); + } + return err; +} + +#ifdef CONFIG_SMP +static int +msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) +{ + struct irq_cfg *cfg = data->chip_data; + struct msi_msg msg; + unsigned int dest; + + if (__ioapic_set_affinity(data, mask, &dest)) + return -1; + + __get_cached_msi_msg(data->msi_desc, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + __write_msi_msg(data->msi_desc, &msg); + + return 0; +} +#endif /* CONFIG_SMP */ + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip msi_chip = { + .name = "PCI-MSI", + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, + .irq_ack = ack_apic_edge, +#ifdef CONFIG_SMP + .irq_set_affinity = msi_set_affinity, +#endif + .irq_retrigger = ioapic_retrigger_irq, +}; + +/* + * Map the PCI dev to the corresponding remapping hardware unit + * and allocate 'nvec' consecutive interrupt-remapping table entries + * in it. + */ +static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) +{ + struct intel_iommu *iommu; + int index; + + iommu = map_dev_to_ir(dev); + if (!iommu) { + printk(KERN_ERR + "Unable to map PCI %s to iommu\n", pci_name(dev)); + return -ENOENT; + } + + index = alloc_irte(iommu, irq, nvec); + if (index < 0) { + printk(KERN_ERR + "Unable to allocate %d IRTE for PCI %s\n", nvec, + pci_name(dev)); + return -ENOSPC; + } + return index; +} + +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) +{ + struct irq_chip *chip = &msi_chip; + struct msi_msg msg; + int ret; + + ret = msi_compose_msg(dev, irq, &msg, -1); + if (ret < 0) + return ret; + + irq_set_msi_desc(irq, msidesc); + write_msi_msg(irq, &msg); + + if (irq_remapped(irq_get_chip_data(irq))) { + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + irq_remap_modify_chip_defaults(chip); + } + + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); + + return 0; +} + +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + int node, ret, sub_handle, index = 0; + unsigned int irq, irq_want; + struct msi_desc *msidesc; + struct intel_iommu *iommu = NULL; + + /* x86 doesn't support multiple MSI yet */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + + node = dev_to_node(&dev->dev); + irq_want = nr_irqs_gsi; + sub_handle = 0; + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want, node); + if (irq == 0) + return -1; + irq_want = irq + 1; + if (!intr_remapping_enabled) + goto no_ir; + + if (!sub_handle) { + /* + * allocate the consecutive block of IRTE's + * for 'nvec' + */ + index = msi_alloc_irte(dev, irq, nvec); + if (index < 0) { + ret = index; + goto error; + } + } else { + iommu = map_dev_to_ir(dev); + if (!iommu) { + ret = -ENOENT; + goto error; + } + /* + * setup the mapping between the irq and the IRTE + * 
base index, the sub_handle pointing to the + * appropriate interrupt remap table entry. + */ + set_irte_irq(irq, iommu, index, sub_handle); + } +no_ir: + ret = setup_msi_irq(dev, msidesc, irq); + if (ret < 0) + goto error; + sub_handle++; + } + return 0; + +error: + destroy_irq(irq); + return ret; +} + +void native_teardown_msi_irq(unsigned int irq) +{ + destroy_irq(irq); +} + +#ifdef CONFIG_DMAR_TABLE +#ifdef CONFIG_SMP +static int +dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; + struct msi_msg msg; + + if (__ioapic_set_affinity(data, mask, &dest)) + return -1; + + dmar_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); + + dmar_msi_write(irq, &msg); + + return 0; +} + +#endif /* CONFIG_SMP */ + +static struct irq_chip dmar_msi_type = { + .name = "DMAR_MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = ack_apic_edge, +#ifdef CONFIG_SMP + .irq_set_affinity = dmar_msi_set_affinity, +#endif + .irq_retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_dmar_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg, -1); + if (ret < 0) + return ret; + dmar_msi_write(irq, &msg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif + +#ifdef CONFIG_HPET_TIMER + +#ifdef CONFIG_SMP +static int hpet_msi_set_affinity(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + struct irq_cfg *cfg = data->chip_data; + struct msi_msg msg; + unsigned int dest; + + if (__ioapic_set_affinity(data, mask, &dest)) + return -1; + + hpet_msi_read(data->handler_data, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + hpet_msi_write(data->handler_data, &msg); + + return 0; +} + +#endif /* CONFIG_SMP */ + +static struct irq_chip hpet_msi_type = { + .name = "HPET_MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = ack_apic_edge, +#ifdef CONFIG_SMP + .irq_set_affinity = hpet_msi_set_affinity, +#endif + .irq_retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_hpet_msi(unsigned int irq, unsigned int id) +{ + struct irq_chip *chip = &hpet_msi_type; + struct msi_msg msg; + int ret; + + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_hpet_to_ir(id); + int index; + + if (!iommu) + return -1; + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + return -1; + } + + ret = msi_compose_msg(NULL, irq, &msg, id); + if (ret < 0) + return ret; + + hpet_msi_write(irq_get_handler_data(irq), &msg); + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + if (irq_remapped(irq_get_chip_data(irq))) + irq_remap_modify_chip_defaults(chip); + + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); + return 0; +} +#endif + +#endif /* CONFIG_PCI_MSI */ +/* + * Hypertransport interrupt support + */ +#ifdef CONFIG_HT_IRQ + +#ifdef CONFIG_SMP + +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + struct ht_irq_msg msg; + fetch_ht_irq_msg(irq, &msg); + + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + + 
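+	/*
+	 * The vector and destination fields were cleared just above; the
+	 * new values are OR-ed back in below before the message is
+	 * rewritten.
+	 */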
msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); + + write_ht_irq_msg(irq, &msg); +} + +static int +ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int dest; + + if (__ioapic_set_affinity(data, mask, &dest)) + return -1; + + target_ht_irq(data->irq, dest, cfg->vector); + return 0; +} + +#endif + +static struct irq_chip ht_irq_chip = { + .name = "PCI-HT", + .irq_mask = mask_ht_irq, + .irq_unmask = unmask_ht_irq, + .irq_ack = ack_apic_edge, +#ifdef CONFIG_SMP + .irq_set_affinity = ht_set_affinity, +#endif + .irq_retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +{ + struct irq_cfg *cfg; + int err; + + if (disable_apic) + return -ENXIO; + + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, apic->target_cpus()); + if (!err) { + struct ht_irq_msg msg; + unsigned dest; + + dest = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus()); + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(cfg->vector) | + ((apic->irq_dest_mode == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; + + write_ht_irq_msg(irq, &msg); + + irq_set_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); + } + return err; +} +#endif /* CONFIG_HT_IRQ */ + +static int +io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) +{ + struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); + int ret; + + if (!cfg) + return -EINVAL; + ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); + if (!ret) + setup_ioapic_irq(irq, cfg, attr); + return ret; +} + +int io_apic_setup_irq_pin_once(unsigned int irq, int node, + struct io_apic_irq_attr *attr) +{ + unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin; + int ret; + + /* Avoid redundant programming */ + if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mpc_ioapic_id(ioapic_idx), pin); + return 0; + } + ret = io_apic_setup_irq_pin(irq, node, attr); + if (!ret) + set_bit(pin, ioapics[ioapic_idx].pin_programmed); + return ret; +} + +static int __init io_apic_get_redir_entries(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* The register returns the maximum index redir index + * supported, which is one less than the total number of redir + * entries. 
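+ *
+ * e.g. a typical 24-pin IO-APIC reports 23 (0x17) here, so the + 1
+ * below turns the highest redirection index back into a pin count.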
+ */ + return reg_01.bits.entries + 1; +} + +#ifndef CONFIG_XEN +static void __init probe_nr_irqs_gsi(void) +{ + int nr; + + nr = gsi_top + NR_IRQS_LEGACY; + if (nr > nr_irqs_gsi) + nr_irqs_gsi = nr; + + printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); +} + +int get_nr_irqs_gsi(void) +{ + return nr_irqs_gsi; +} + +int __init arch_probe_nr_irqs(void) +{ + int nr; + + if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) + nr_irqs = NR_VECTORS * nr_cpu_ids; + + nr = nr_irqs_gsi + 8 * nr_cpu_ids; +#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) + /* + * for MSI and HT dyn irq + */ + nr += nr_irqs_gsi * 16; +#endif + if (nr < nr_irqs) + nr_irqs = nr; + + return NR_IRQS_LEGACY; +} +#endif /* CONFIG_XEN */ + +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) +{ + int node; + +#ifdef CONFIG_XEN + if (irq < PIRQ_BASE || irq >= PIRQ_BASE + nr_pirqs) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ %d\n", + irq_attr->ioapic, irq); + return -EINVAL; + } +#endif + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + irq_attr->ioapic); + return -EINVAL; + } + + node = dev ? dev_to_node(dev) : cpu_to_node(0); + + return io_apic_setup_irq_pin_once(irq, node, irq_attr); +} + +#ifdef CONFIG_X86_32 +#ifndef CONFIG_XEN +static int __init io_apic_get_unique_id(int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. 
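+ *
+ * If the requested ID is already taken, the fallback below simply picks
+ * the first ID not yet present in apic_id_map, sketched as:
+ *
+ *	while (apic->check_apicid_used(&apic_id_map, i))
+ *		i++;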
+ */ + if (apic->check_apicid_used(&apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!apic->check_apicid_used(&apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + apic->apicid_to_cpu_present(apic_id, &tmp); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} +#endif + +static u8 __init io_apic_unique_id(u8 id) +{ +#ifndef CONFIG_XEN + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else +#endif + return id; +} +#else +static u8 __init io_apic_unique_id(u8 id) +{ + int i; + DECLARE_BITMAP(used, 256); + + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + __set_bit(mpc_ioapic_id(i), used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +} +#endif + +static int __init io_apic_get_version(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} + +int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) +{ + int ioapic, pin, idx; + + if (skip_ioapic_setup) + return -1; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + if (pin < 0) + return -1; + + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + return -1; + + *trigger = irq_trigger(idx); + *polarity = irq_polarity(idx); + return 0; +} + +#ifndef CONFIG_XEN +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be apic->target_cpus() + */ +#ifdef CONFIG_SMP +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + const struct cpumask *mask; + struct irq_data *idata; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) + for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + + if ((ioapic > 0) && (irq > 16)) + continue; + + idata = irq_get_irq_data(irq); + + /* + * Honour affinities which have been set in early boot + */ + if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) + mask = idata->affinity; + else + mask = apic->target_cpus(); + + if (intr_remapping_enabled) + ir_ioapic_set_affinity(idata, mask, false); + else + ioapic_set_affinity(idata, mask, false); + } + +} +#endif + +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(int nr_ioapics) +{ + unsigned long n; + struct resource 
*res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + + ioapic_resources = res; + + return res; +} + +void __init ioapic_and_gsi_init(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + struct resource *ioapic_res; + int i; + + ioapic_res = ioapic_setup_resources(nr_ioapics); + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mpc_ioapic_addr(i); +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif + } else { +#ifdef CONFIG_X86_32 +fake_ioapic_page: +#endif + ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), + ioapic_phys); + idx++; + + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; + ioapic_res++; + } + + probe_nr_irqs_gsi(); +} + +void __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + if (nr_ioapics > 0) + printk(KERN_ERR + "IO APIC resources couldn't be allocated.\n"); + return; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } +} +#endif /* !CONFIG_XEN */ + +int mp_find_ioapic(u32 gsi) +{ + int i = 0; + + if (nr_ioapics == 0) + return -1; + + /* Find the IOAPIC that manages this GSI. 
*/ + for (i = 0; i < nr_ioapics; i++) { + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); + if ((gsi >= gsi_cfg->gsi_base) + && (gsi <= gsi_cfg->gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +int mp_find_ioapic_pin(int ioapic, u32 gsi) +{ + struct mp_ioapic_gsi *gsi_cfg; + + if (WARN_ON(ioapic == -1)) + return -1; + + gsi_cfg = mp_ioapic_gsi_routing(ioapic); + if (WARN_ON(gsi > gsi_cfg->gsi_end)) + return -1; + + return gsi - gsi_cfg->gsi_base; +} + +static __init int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded " + "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); + return 1; + } + if (!address) { + printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) +{ + int idx = 0; + int entries; + struct mp_ioapic_gsi *gsi_cfg; + + if (bad_ioapic(address)) + return; + + idx = nr_ioapics; + + ioapics[idx].mp_config.type = MP_IOAPIC; + ioapics[idx].mp_config.flags = MPC_APIC_USABLE; + ioapics[idx].mp_config.apicaddr = address; + +#ifndef CONFIG_XEN + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); +#endif + ioapics[idx].mp_config.apicid = io_apic_unique_id(id); + ioapics[idx].mp_config.apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + entries = io_apic_get_redir_entries(idx); + gsi_cfg = mp_ioapic_gsi_routing(idx); + gsi_cfg->gsi_base = gsi_base; + gsi_cfg->gsi_end = gsi_base + entries - 1; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + ioapics[idx].nr_registers = entries; + + if (gsi_cfg->gsi_end >= gsi_top) + gsi_top = gsi_cfg->gsi_end + 1; + + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mpc_ioapic_id(idx), + mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + gsi_cfg->gsi_base, gsi_cfg->gsi_end); + + nr_ioapics++; +} + +#ifdef CONFIG_X86_MRST +/* Enable IOAPIC early just for system timer */ +void __init pre_init_apic_IRQ0(void) +{ + struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; + + printk(KERN_INFO "Early APIC setup for system timer0\n"); +#ifndef CONFIG_SMP + physid_set_mask_of_physid(boot_cpu_physical_apicid, + &phys_cpu_present_map); +#endif + setup_local_APIC(); + + io_apic_setup_irq_pin(0, 0, &attr); + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); +} +#endif diff --git a/arch/x86/kernel/apic/ipi-xen.c b/arch/x86/kernel/apic/ipi-xen.c new file mode 100644 index 0000000..a3ee607 --- /dev/null +++ b/arch/x86/kernel/apic/ipi-xen.c @@ -0,0 +1,43 @@ +#include +#include + +#include +#include + +#ifdef CONFIG_SMP +#include + +void xen_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + + WARN_ON(!cpumask_subset(cpumask, cpu_online_mask)); + for_each_cpu_and(cpu, cpumask, cpu_online_mask) + if (cpu != this_cpu) + notify_remote_via_ipi(vector, cpu); +} + +void xen_send_IPI_mask(const struct cpumask *cpumask, int vector) +{ + unsigned int cpu; + + WARN_ON(!cpumask_subset(cpumask, cpu_online_mask)); + for_each_cpu_and(cpu, cpumask, cpu_online_mask) + notify_remote_via_ipi(vector, cpu); +} + +void xen_send_IPI_allbutself(int vector) +{ + xen_send_IPI_mask_allbutself(cpu_online_mask, vector); +} + +void 
xen_send_IPI_all(int vector) +{ + xen_send_IPI_mask(cpu_online_mask, vector); +} + +void xen_send_IPI_self(int vector) +{ + notify_remote_via_ipi(vector, smp_processor_id()); +} +#endif diff --git a/arch/x86/kernel/apic/probe_32-xen.c b/arch/x86/kernel/apic/probe_32-xen.c new file mode 100644 index 0000000..8602fa9 --- /dev/null +++ b/arch/x86/kernel/apic/probe_32-xen.c @@ -0,0 +1,57 @@ +/* + * Default generic APIC driver. This handles up to 8 CPUs. + * + * Copyright 2003 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License, v.2 + * + * Generic x86 APIC driver probe layer. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +static int xen_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return cpuid_apic; +} + +static struct apic apic_xen = { + + .name = "default", + + .irq_delivery_mode = dest_LowestPrio, + /* logical delivery broadcast to all CPUs: */ + .irq_dest_mode = 1, + + .target_cpus = default_target_cpus, + + .phys_pkg_id = xen_phys_pkg_id, + +#ifdef CONFIG_SMP + .send_IPI_mask = xen_send_IPI_mask, + .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself, + .send_IPI_allbutself = xen_send_IPI_allbutself, + .send_IPI_all = xen_send_IPI_all, + .send_IPI_self = xen_send_IPI_self, +#endif +}; + +struct apic *apic = &apic_xen; +EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 68de2dc..3d01fce 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -17,7 +17,7 @@ #include #include -#ifdef CONFIG_XEN +#if defined(CONFIG_XEN) || defined(CONFIG_PARAVIRT_XEN) #include #endif @@ -55,7 +55,7 @@ void common(void) { OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN BLANK(); OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 85d98ab..cf6cc38 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -1,7 +1,9 @@ #include +#ifdef CONFIG_LGUEST_GUEST #include #include "../../../drivers/lguest/lg.h" +#endif #define __SYSCALL_I386(nr, sym, compat) [nr] = 1, static char syscalls[] = { @@ -60,9 +62,19 @@ void foo(void) OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); +#ifndef CONFIG_X86_NO_TSS /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - + DEFINE(SYSENTER_stack_sp0, offsetof(struct tss_struct, x86_tss.sp0) - sizeof(struct tss_struct)); +#else + /* sysenter stack points directly to sp0 */ + DEFINE(SYSENTER_stack_sp0, 0); +#endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_START_mfn_list, start_info, mfn_list); +#endif #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 834e897..8d13cc2 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -70,8 +70,10 @@ int main(void) BLANK(); #undef ENTRY +#ifndef CONFIG_X86_NO_TSS OFFSET(TSS_ist, tss_struct, x86_tss.ist); BLANK(); +#endif DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6ab6aa2..fefe4b9 100644 --- 
a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -40,6 +40,9 @@ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o +disabled-obj-$(CONFIG_XEN) := hypervisor.o mshyperv.o perfctr-watchdog.o \ + perf_event.o perf_event_%.o sched.o vmware.o + quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f4773f4..a5b9661 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -333,7 +333,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) int amd_get_nb_id(int cpu) { int id = 0; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) id = per_cpu(cpu_llc_id, cpu); #endif return id; @@ -467,7 +467,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) (c->x86_model == 8 && c->x86_mask >= 8)) set_cpu_cap(c, X86_FEATURE_K6_MTRR); #endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) && !defined(CONFIG_XEN) /* check CPU config space for extended APIC ID */ if (cpu_has_apic && c->x86 >= 0xf) { unsigned int val; @@ -480,7 +480,9 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) static void __cpuinit init_amd(struct cpuinfo_x86 *c) { +#ifndef CONFIG_XEN u32 dummy; +#endif #ifdef CONFIG_SMP unsigned long long value; @@ -525,18 +527,26 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) u64 val; clear_cpu_cap(c, X86_FEATURE_LAHF_LM); +#ifndef CONFIG_XEN if (!rdmsrl_amd_safe(0xc001100d, &val)) { val &= ~(1ULL << 32); wrmsrl_amd_safe(0xc001100d, val); } +#else + pr_warning("Long-mode LAHF feature wrongly enabled -" + "hypervisor update needed\n"); + (void)&val; +#endif } } if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#ifndef CONFIG_XEN /* get apicid instead of initial apic id from cpuid */ c->apicid = hard_smp_processor_id(); +#endif #else /* @@ -612,6 +622,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) fam10h_check_enable_mmcfg(); } +#ifndef CONFIG_XEN if (c == &boot_cpu_data && c->x86 >= 0xf) { unsigned long long tseg; @@ -631,6 +642,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } #endif +#endif /* * Family 0x12 and above processors have APIC timer @@ -639,6 +651,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); +#ifndef CONFIG_XEN /* * Disable GART TLB Walk Errors on Fam10h. 
We do this here * because this is always needed when GART is enabled, even in a @@ -662,6 +675,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); +#endif } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 46674fb..09087bd2 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -17,6 +17,7 @@ #include #include +#ifndef CONFIG_XEN static int __init no_halt(char *s) { WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n"); @@ -25,6 +26,7 @@ static int __init no_halt(char *s) } __setup("no-hlt", no_halt); +#endif static int __init no_387(char *s) { @@ -84,13 +86,16 @@ static void __init check_fpu(void) kernel_fpu_end(); +#ifndef CONFIG_XEN boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); +#endif } static void __init check_hlt(void) { +#ifndef CONFIG_XEN if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) return; @@ -104,6 +109,7 @@ static void __init check_hlt(void) halt(); halt(); printk(KERN_CONT "OK.\n"); +#endif } /* diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c index 04f0fe5..25a2cda 100644 --- a/arch/x86/kernel/cpu/bugs_64.c +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -20,6 +20,7 @@ void __init check_bugs(void) #endif alternative_instructions(); +#ifndef CONFIG_XEN /* * Make sure the first 2MB area is not mapped by huge pages * There are typically fixed size MTRRs in there and overlapping @@ -30,4 +31,5 @@ void __init check_bugs(void) */ if (!direct_gbpages) set_memory_4k((unsigned long)__va(0), 1); +#endif } diff --git a/arch/x86/kernel/cpu/common-xen.c b/arch/x86/kernel/cpu/common-xen.c new file mode 100644 index 0000000..05e2af1 --- /dev/null +++ b/arch/x86/kernel/cpu/common-xen.c @@ -0,0 +1,1435 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +#include +#endif + +#ifdef CONFIG_XEN +#include +#endif + +#include "cpu.h" + +/* all of these masks are initialized in setup_cpu_local_masks() */ +cpumask_var_t cpu_initialized_mask; +#ifndef CONFIG_XEN +cpumask_var_t cpu_callout_mask; +cpumask_var_t cpu_callin_mask; + +/* representing cpus for which sibling maps can be computed */ +cpumask_var_t cpu_sibling_setup_mask; +#endif + +/* correctly size the local cpu masks */ +void __init setup_cpu_local_masks(void) +{ + alloc_bootmem_cpumask_var(&cpu_initialized_mask); +#ifndef CONFIG_XEN + alloc_bootmem_cpumask_var(&cpu_callin_mask); + alloc_bootmem_cpumask_var(&cpu_callout_mask); + alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); +#endif +} + +static void __cpuinit default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + cpu_detect_cache_sizes(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. 
It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev __cpuinitconst default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; + +DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +#ifdef CONFIG_X86_64 + /* + * We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + * + * TLS descriptors are currently at a different place compared to i386. + * Hopefully nobody expects them at a fixed place (Wine?) + */ + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), +#else + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), +#ifndef CONFIG_XEN + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + /* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + /* 16-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. 
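
/*
 * Editor's note, not part of the patch: the GDT_ENTRY_INIT(0xc09b, 0, 0xfffff)
 * style initialisers above pack a 16-bit flags value, a base and a 20-bit
 * limit into the 8-byte descriptor layout.  The sketch below rebuilds one
 * descriptor the same way and unpacks it again, so constants such as 0xc09b
 * (present DPL-0 code, 4 KiB granularity, 32-bit) are easier to read.  It is
 * an independent user-space illustration, not the kernel macro itself.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t gdt_entry(uint16_t flags, uint32_t base, uint32_t limit)
{
        uint64_t d = 0;

        d |= (uint64_t)(limit & 0xffff);                /* limit 15:0     */
        d |= (uint64_t)(base & 0xffffff) << 16;         /* base 23:0      */
        d |= (uint64_t)(flags & 0xff) << 40;            /* access byte    */
        d |= (uint64_t)((limit >> 16) & 0xf) << 48;     /* limit 19:16    */
        d |= (uint64_t)((flags >> 8) & 0xf0) << 48;     /* G, D/B, L, AVL */
        d |= (uint64_t)(base & 0xff000000) << 32;       /* base 31:24     */
        return d;
}

int main(void)
{
        uint64_t d = gdt_entry(0xc09b, 0, 0xfffff);

        printf("descriptor  = %#018llx\n", (unsigned long long)d);
        printf("access byte = %#04x\n", (unsigned)((d >> 40) & 0xff));
        printf("granularity = %u, 32-bit = %u, long = %u\n",
               (unsigned)((d >> 55) & 1), (unsigned)((d >> 54) & 1),
               (unsigned)((d >> 53) & 1));
        printf("limit       = %#x\n",
               (unsigned)((d & 0xffff) | (((d >> 48) & 0xf) << 16)));
        return 0;
}
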
+ */ + /* 32-bit code */ + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + /* 16-bit code */ + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), + /* data */ + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), + + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), +#endif + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + GDT_STACK_CANARY_INIT +#endif +} }; +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); + +static int __init x86_xsave_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsave", x86_xsave_setup); + +static int __init x86_xsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsaveopt", x86_xsaveopt_setup); + +#ifdef CONFIG_X86_32 +static int cachesize_override __cpuinitdata = -1; + +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +static int __init x86_fxsr_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + +static int __init x86_sep_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_SEP); + return 1; +} +__setup("nosep", x86_sep_setup); +#endif + +#if defined(CONFIG_X86_32) && !defined(CONFIG_XEN) +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + /* + * Cyrix and IDT cpus allow disabling of CPUID + * so the code below may return different results + * when it is executed before and after enabling + * the CPUID. Add "volatile" to not allow gcc to + * optimize the subsequent calls to this function. 
+ */ + asm volatile ("pushfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "movl %0, %1 \n\t" + "xorl %2, %0 \n\t" + "pushl %0 \n\t" + "popfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "popfl \n\t" + + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + +/* Probe for the CPUID instruction */ +static int __cpuinit have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + +static int disable_x86_serial_nr __cpuinitdata = 1; + +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + unsigned long lo, hi; + + if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr) + return; + + /* Disable processor serial number: */ + + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); +#else +static inline int flag_is_changeable_p(u32 flag) +{ + return 1; +} +/* Probe for the CPUID instruction */ +static inline int have_cpuid_p(void) +{ + return 1; +} +static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ +} +#endif + +static int disable_smep __cpuinitdata; +static __init int setup_disable_smep(char *arg) +{ + disable_smep = 1; + return 1; +} +__setup("nosmep", setup_disable_smep); + +static __cpuinit void setup_smep(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_SMEP)) { + if (unlikely(disable_smep)) { + setup_clear_cpu_cap(X86_FEATURE_SMEP); + clear_in_cr4(X86_CR4_SMEP); + } else + set_in_cr4(X86_CR4_SMEP); + } +} + +/* + * Some CPU features depend on higher CPUID levels, which may not always + * be available due to CPUID level capping or broken virtualization + * software. Add those features to this table to auto-disable them. + */ +struct cpuid_dependent_feature { + u32 feature; + u32 level; +}; + +static const struct cpuid_dependent_feature __cpuinitconst +cpuid_dependent_features[] = { + { X86_FEATURE_MWAIT, 0x00000005 }, + { X86_FEATURE_DCA, 0x00000009 }, + { X86_FEATURE_XSAVE, 0x0000000d }, + { 0, 0 } +}; + +static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) +{ + const struct cpuid_dependent_feature *df; + + for (df = cpuid_dependent_features; df->feature; df++) { + + if (!cpu_has(c, df->feature)) + continue; + /* + * Note: cpuid_level is set to -1 if unavailable, but + * extended_extended_level is set to 0 if unavailable + * and the legitimate extended levels are all negative + * when signed; hence the weird messing around with + * signs here... + */ + if (!((s32)df->level < 0 ? + (u32)df->level > (u32)c->extended_cpuid_level : + (s32)df->level > (s32)c->cpuid_level)) + continue; + + clear_cpu_cap(c, df->feature); + if (!warn) + continue; + + printk(KERN_WARNING + "CPU: CPU feature %s disabled, no CPUID level 0x%x\n", + x86_cap_flags[df->feature], df->level); + } +} + +/* + * Naming convention should be: [()] + * This table only is used unless init_() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this + * isn't used + */ + +/* Look up CPU names by table lookup. 
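
/*
 * Editor's illustration, not part of the patch: the sign games in
 * filter_cpuid_features() above are easier to see in isolation.  Extended
 * CPUID levels (0x8000xxxx) are negative when viewed as s32, so they are
 * compared unsigned against extended_cpuid_level, while basic levels are
 * compared signed against cpuid_level (which is -1 when CPUID is absent).
 * The leaf/level values in main() are arbitrary samples.
 */
#include <stdio.h>
#include <stdint.h>

static int level_missing(uint32_t required, int32_t cpuid_level,
                         uint32_t extended_cpuid_level)
{
        if ((int32_t)required < 0)              /* extended leaf 0x8000xxxx */
                return required > extended_cpuid_level;
        return (int32_t)required > cpuid_level; /* basic leaf */
}

int main(void)
{
        /* CPU reporting basic leaves up to 0xb and extended up to 0x80000008 */
        int32_t cpuid_level = 0x0000000b;
        uint32_t ext_level = 0x80000008;

        printf("MWAIT (0x5)  missing: %d\n",
               level_missing(0x00000005, cpuid_level, ext_level));
        printf("XSAVE (0xd)  missing: %d\n",
               level_missing(0x0000000d, cpuid_level, ext_level));
        printf("0x8000000a   missing: %d\n",
               level_missing(0x8000000a, cpuid_level, ext_level));
        return 0;
}
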
*/ +static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) +{ + const struct cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->c_models; + + while (info && info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } + return NULL; /* Not found */ +} + +__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; + +void __ref load_percpu_segment(int cpu) +{ +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT + static bool done; + + if (!done) { + done = true; + adjust_boot_vcpu_info(); + } +#endif +#ifdef CONFIG_X86_32 + loadsegment(fs, __KERNEL_PERCPU); +#else + loadsegment(gs, 0); +#ifndef CONFIG_XEN + wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); +#else + if (HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, + (unsigned long)per_cpu(irq_stack_union.gs_base, cpu))) + BUG(); +#endif +#endif + load_stack_canary_segment(); +} + +/* + * Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. + */ +void switch_to_new_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + unsigned long va, frames[16]; + int f; + + gdt_descr.address = (long)get_cpu_gdt_table(cpu); + gdt_descr.size = GDT_SIZE - 1; + + for (va = gdt_descr.address, f = 0; + va < gdt_descr.address + gdt_descr.size; + va += PAGE_SIZE, f++) { + frames[f] = arbitrary_virt_to_mfn(va); + make_page_readonly((void *)va, + XENFEAT_writable_descriptor_tables); + } + if (HYPERVISOR_set_gdt(frames, (gdt_descr.size + 1) / 8)) + BUG(); + + /* Reload the per-cpu base */ + + load_percpu_segment(cpu); +} + +static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; + +static void __cpuinit get_model_name(struct cpuinfo_x86 *c) +{ + unsigned int *v; + char *p, *q; + + if (c->extended_cpuid_level < 0x80000004) + return; + + v = (unsigned int *)c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + + /* + * Intel chips right-justify this string for some dumb reason; + * undo that brain damage: + */ + p = q = &c->x86_model_id[0]; + while (*p == ' ') + p++; + if (p != q) { + while (*p) + *q++ = *p++; + while (q <= &c->x86_model_id[48]) + *q++ = '\0'; /* Zero-pad the rest */ + } +} + +void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) +{ + unsigned int n, dummy, ebx, ecx, edx, l2size; + + n = c->extended_cpuid_level; + + if (n >= 0x80000005) { + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); + c->x86_cache_size = (ecx>>24) + (edx>>24); +#ifdef CONFIG_X86_64 + /* On K8 L1 TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; +#endif + } + + if (n < 0x80000006) /* Some chips just has a large L1. */ + return; + + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); + l2size = ecx >> 16; + +#ifdef CONFIG_X86_64 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); +#else + /* do processor-specific cache resizing */ + if (this_cpu->c_size_cache) + l2size = this_cpu->c_size_cache(c, l2size); + + /* Allow user to override all this if necessary. 
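
/*
 * Editor's illustration, not part of the patch: the in-place left-justify
 * done at the end of get_model_name() above, extracted into a user-space
 * helper.  The sample brand string is made up.
 */
#include <stdio.h>

static void left_justify(char model_id[49])
{
        char *p, *q;

        /* Skip the leading spaces the brand string is padded with ... */
        p = q = model_id;
        while (*p == ' ')
                p++;
        /* ... then shift the text down and zero-pad the tail. */
        if (p != q) {
                while (*p)
                        *q++ = *p++;
                while (q <= &model_id[48])
                        *q++ = '\0';
        }
}

int main(void)
{
        char model[49] = "      Intel(R) Xeon(R) CPU           X9999";  /* sample */

        left_justify(model);
        printf("'%s'\n", model);
        return 0;
}
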
*/ + if (cachesize_override != -1) + l2size = cachesize_override; + + if (l2size == 0) + return; /* Again, no L2 cache is possible */ +#endif + + c->x86_cache_size = l2size; +} + +void __cpuinit detect_ht(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_HT + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; + static bool printed; + + if (!cpu_has(c, X86_FEATURE_HT)) + return; + + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; + + if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) + return; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n"); + goto out; + } + + if (smp_num_siblings <= 1) + goto out; + + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings); + + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & + ((1 << core_bits) - 1); + +out: + if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + printed = 1; + } +#endif +} + +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) +{ + char *v = c->x86_vendor_id; + int i; + + for (i = 0; i < X86_VENDOR_NUM; i++) { + if (!cpu_devs[i]) + break; + + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { + + this_cpu = cpu_devs[i]; + c->x86_vendor = this_cpu->c_x86_vendor; + return; + } + } + + printk_once(KERN_ERR + "CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); + + c->x86_vendor = X86_VENDOR_UNKNOWN; + this_cpu = &default_cpu; +} + +void __cpuinit cpu_detect(struct cpuinfo_x86 *c) +{ + /* Get vendor name */ + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); + + c->x86 = 4; + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 junk, tfms, cap0, misc; + + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); + c->x86 = (tfms >> 8) & 0xf; + c->x86_model = (tfms >> 4) & 0xf; + c->x86_mask = tfms & 0xf; + + if (c->x86 == 0xf) + c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) + c->x86_model += ((tfms >> 16) & 0xf) << 4; + + if (cap0 & (1<<19)) { + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + c->x86_cache_alignment = c->x86_clflush_size; + } + } +} + +void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) +{ + u32 tfms, xlvl; + u32 ebx; + + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } + + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + u32 eax, ebx, ecx, edx; + + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + + c->x86_capability[9] = ebx; + } + + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + c->extended_cpuid_level = xlvl; + + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + } + + if 
(c->extended_cpuid_level >= 0x80000008) { + u32 eax = cpuid_eax(0x80000008); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } +#ifdef CONFIG_X86_32 + else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; +#endif + + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); + + init_scattered_cpuid_features(c); +} + +static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + int i; + + /* + * First of all, decide if this is a 486 or higher + * It's a 486 if we can modify the AC flag + */ + if (flag_is_changeable_p(X86_EFLAGS_AC)) + c->x86 = 4; + else + c->x86 = 3; + + for (i = 0; i < X86_VENDOR_NUM; i++) + if (cpu_devs[i] && cpu_devs[i]->c_identify) { + c->x86_vendor_id[0] = 0; + cpu_devs[i]->c_identify(c); + if (c->x86_vendor_id[0]) { + get_cpu_vendor(c); + break; + } + } +#endif +} + +/* + * Do minimum CPU detection early. + * Fields really needed: vendor, cpuid_level, family, model, mask, + * cache alignment. + * The others are not touched to avoid unwanted side effects. + * + * WARNING: this function is only called on the BP. Don't add code here + * that is supposed to run on all CPUs. + */ +static void __init early_identify_cpu(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; +#else + c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c); + + get_cpu_cap(c); +#ifdef CONFIG_XEN + if (!cpu_has_xsave) + x86_xsave_setup(NULL); +#endif + + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); + + c->cpu_index = 0; + filter_cpuid_features(c, false); + + setup_smep(c); + + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); +} + +void __init early_cpu_init(void) +{ + const struct cpu_dev *const *cdev; + int count = 0; + +#ifdef CONFIG_PROCESSOR_SELECT + printk(KERN_INFO "KERNEL supported cpus:\n"); +#endif + + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { + const struct cpu_dev *cpudev = *cdev; + + if (count >= X86_VENDOR_NUM) + break; + cpu_devs[count] = cpudev; + count++; + +#ifdef CONFIG_PROCESSOR_SELECT + { + unsigned int j; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } + } +#endif + } + early_identify_cpu(&boot_cpu_data); +} + +/* + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit + * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). 
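
/*
 * Editor's illustration, not part of the patch: how cpu_detect() and
 * get_cpu_cap() above decode the raw CPUID words.  The two inputs are
 * hard-coded samples (a family-6 leaf 1 EAX and a leaf 0x80000008 EAX),
 * not read from a real CPU.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t tfms = 0x000206c2;     /* sample CPUID.1:EAX */
        uint32_t addr = 0x00003028;     /* sample CPUID.0x80000008:EAX */
        unsigned family, model, stepping;

        family = (tfms >> 8) & 0xf;
        model = (tfms >> 4) & 0xf;
        stepping = tfms & 0xf;
        if (family == 0xf)
                family += (tfms >> 20) & 0xff;          /* extended family */
        if (family >= 0x6)
                model += ((tfms >> 16) & 0xf) << 4;     /* extended model */

        printf("family %#x, model %#x, stepping %u\n", family, model, stepping);
        printf("physical address bits: %u, virtual address bits: %u\n",
               addr & 0xff, (addr >> 8) & 0xff);
        return 0;
}
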
+ */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + clear_cpu_cap(c, X86_FEATURE_NOPL); +#else + set_cpu_cap(c, X86_FEATURE_NOPL); +#endif +} + +static void __cpuinit generic_identify(struct cpuinfo_x86 *c) +{ + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c); + + get_cpu_cap(c); + +#ifndef CONFIG_XEN + if (c->cpuid_level >= 0x00000001) { + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_HT + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); +# else + c->apicid = c->initial_apicid; +# endif +#endif + c->phys_proc_id = c->initial_apicid; + } +#endif + + setup_smep(c); + + get_model_name(c); /* Default name */ + + detect_nopl(c); +} + +/* + * This does the hard work of actually picking apart the CPU stuff... + */ +static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = -1; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->x86_model = c->x86_mask = 0; /* So far unknown... */ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ +#ifndef CONFIG_XEN + c->x86_max_cores = 1; + c->x86_coreid_bits = 0; +#endif +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; +#else + c->cpuid_level = -1; /* CPUID not detected */ + c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + memset(&c->x86_capability, 0, sizeof c->x86_capability); + if (boot_cpu_has(X86_FEATURE_SYSCALL32)) + set_cpu_cap(c, X86_FEATURE_SYSCALL32); + + generic_identify(c); + + if (this_cpu->c_identify) + this_cpu->c_identify(c); + + /* Clear/Set all flags overriden by options, after probe */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); +#endif + + /* + * Vendor-specific initialization. In this section we + * canonicalize the feature flags, meaning if there are + * features a certain CPU supports which CPUID doesn't + * tell us, CPUID claiming incorrect flags, or other bugs, + * we handle them here. + * + * At the end of this section, c->x86_capability better + * indicate the features this CPU genuinely supports! + */ + if (this_cpu->c_init) + this_cpu->c_init(c); + + /* Disable the PN if appropriate */ + squash_the_stupid_serial_number(c); + + /* + * The vendor-specific functions might have changed features. + * Now we do "generic changes." + */ + + /* Filter out anything that depends on CPUID levels we don't have */ + filter_cpuid_features(c, true); + + /* If the model name is still unset, do table lookup. */ + if (!c->x86_model_id[0]) { + const char *p; + p = table_lookup_model(c); + if (p) + strcpy(c->x86_model_id, p); + else + /* Last resort... */ + sprintf(c->x86_model_id, "%02x/%02x", + c->x86, c->x86_model); + } + +#ifdef CONFIG_X86_64 + detect_ht(c); +#endif + + init_hypervisor(c); + x86_init_rdrand(c); + + /* + * Clear/Set all flags overriden by options, need do it + * before following smp all cpus cap AND. 
+ */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + + /* + * On SMP, boot_cpu_data holds the common feature set between + * all CPUs; so make sure that we indicate which features are + * common between the CPUs. The first time this routine gets + * executed, c == &boot_cpu_data. + */ + if (c != &boot_cpu_data) { + /* AND the already accumulated flags with these */ + for (i = 0; i < NCAPINTS; i++) + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + } + + /* Init Machine Check Exception if available. */ + mcheck_cpu_init(c); + + select_idle_routine(c); + +#ifdef CONFIG_NUMA + numa_add_cpu(smp_processor_id()); +#endif +} + +#ifdef CONFIG_X86_64 +static void vgetcpu_set_mode(void) +{ + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; +} +#endif + +void __init identify_boot_cpu(void) +{ + identify_cpu(&boot_cpu_data); + init_amd_e400_c1e_mask(); +#ifdef CONFIG_X86_32 + sysenter_setup(); + enable_sep_cpu(); +#else + vgetcpu_set_mode(); +#endif +} + +#ifdef CONFIG_XEN +void set_perf_event_pending(void) {} +#endif + +void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) +{ + BUG_ON(c == &boot_cpu_data); + identify_cpu(c); +#ifdef CONFIG_X86_32 + enable_sep_cpu(); +#endif + mtrr_ap_init(); +} + +struct msr_range { + unsigned min; + unsigned max; +}; + +static const struct msr_range msr_range_array[] __cpuinitconst = { + { 0x00000000, 0x00000418}, + { 0xc0000000, 0xc000040b}, + { 0xc0010000, 0xc0010142}, + { 0xc0011000, 0xc001103b}, +}; + +static void __cpuinit print_cpu_msr(void) +{ + unsigned index_min, index_max; + unsigned index; + u64 val; + int i; + + for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { + index_min = msr_range_array[i].min; + index_max = msr_range_array[i].max; + + for (index = index_min; index < index_max; index++) { + if (rdmsrl_amd_safe(index, &val)) + continue; + printk(KERN_INFO " MSR%08x: %016llx\n", index, val); + } + } +} + +static int show_msr __cpuinitdata; + +static __init int setup_show_msr(char *arg) +{ + int num; + + get_option(&arg, &num); + + if (num > 0) + show_msr = num; + return 1; +} +__setup("show_msr=", setup_show_msr); + +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; +} +__setup("noclflush", setup_noclflush); + +void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) +{ + const char *vendor = NULL; + + if (c->x86_vendor < X86_VENDOR_NUM) { + vendor = this_cpu->c_vendor; + } else { + if (c->cpuid_level >= 0) + vendor = c->x86_vendor_id; + } + + if (vendor && !strstr(c->x86_model_id, vendor)) + printk(KERN_CONT "%s ", vendor); + + if (c->x86_model_id[0]) + printk(KERN_CONT "%s", c->x86_model_id); + else + printk(KERN_CONT "%d86", c->x86); + + if (c->x86_mask || c->cpuid_level >= 0) + printk(KERN_CONT " stepping %02x\n", c->x86_mask); + else + printk(KERN_CONT "\n"); + +#ifdef CONFIG_SMP + if (c->cpu_index < show_msr) + print_cpu_msr(); +#else + if (show_msr) + print_cpu_msr(); +#endif +} + +static __init int setup_disablecpuid(char *arg) +{ + int bit; + + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + + return 1; +} +__setup("clearcpuid=", setup_disablecpuid); + +#ifdef CONFIG_X86_64 +#ifndef CONFIG_X86_NO_IDT +struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) nmi_idt_table }; +#endif 
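
/*
 * Editor's illustration, not part of the patch: the two capability fix-up
 * steps used in identify_cpu() above, i.e. re-applying cpu_caps_cleared /
 * cpu_caps_set after probing and AND-ing a secondary CPU's flags into the
 * boot CPU's common set.  A single 32-bit word stands in for the full
 * NCAPINTS array; the bit values are arbitrary.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t boot_caps = 0xffffffff;        /* accumulated common set */
        uint32_t cpu_caps = 0x00ff00ff;         /* what this CPU probed */
        uint32_t caps_cleared = 0x00000001;     /* e.g. from "clearcpuid=" */
        uint32_t caps_set = 0x80000000;         /* forced-on bits */

        /* Re-apply command-line/quirk overrides after probing. */
        cpu_caps &= ~caps_cleared;
        cpu_caps |= caps_set;

        /* Keep only features that every CPU seen so far supports. */
        boot_caps &= cpu_caps;

        printf("cpu caps:    %#010x\n", (unsigned)cpu_caps);
        printf("common caps: %#010x\n", (unsigned)boot_caps);
        return 0;
}
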
+ +DEFINE_PER_CPU_FIRST(union irq_stack_union, + irq_stack_union) __aligned(PAGE_SIZE); + +void xen_switch_pt(void) +{ +#ifdef CONFIG_XEN + xen_pt_switch(init_level4_pgt); +#endif +} + +/* + * The following four percpu variables are hot. Align current_task to + * cacheline size such that all four fall in the same cacheline. + */ +DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = + &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + +DEFINE_PER_CPU(unsigned long, kernel_stack) = + (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(kernel_stack); + +DEFINE_PER_CPU(char *, irq_stack_ptr) = + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + +DEFINE_PER_CPU(unsigned int, irq_count) = -1; + +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); + +#ifndef CONFIG_X86_NO_TSS +/* + * Special IST stacks which the CPU switches to when it calls + * an IST-marked descriptor entry. Up to 7 stacks (hardware + * limit), all of them are 4K, except the debug stack which + * is 8K. + */ +static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ +}; + +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +#endif + +void __cpuinit syscall_init(void) +{ +#ifndef CONFIG_XEN + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_CSTAR, ignore_sysret); +#endif + +#ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init(); +#elif defined(CONFIG_XEN) + static const struct callback_register __cpuinitconst cstar = { + .type = CALLBACKTYPE_syscall32, + .address = (unsigned long)ignore_sysret + }; + + if (HYPERVISOR_callback_op(CALLBACKOP_register, &cstar)) + printk(KERN_WARNING "Unable to register CSTAR callback\n"); +#endif + +#ifndef CONFIG_XEN + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); +#endif +} + +unsigned long kernel_eflags; + +#ifndef CONFIG_X86_NO_TSS +/* + * Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. 
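
/*
 * Editor's illustration, not part of the patch: the MSR_STAR value written in
 * syscall_init() above (native path only; the Xen build registers a syscall32
 * callback instead) packs the SYSRET base selector into bits 63:48 and the
 * SYSCALL CS/SS base selector into bits 47:32.  The selector values below
 * (0x10 and 0x23) are typical x86-64 GDT selectors, used here only as sample
 * inputs.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint16_t kernel_cs = 0x10;      /* sample __KERNEL_CS */
        uint16_t user32_cs = 0x23;      /* sample __USER32_CS */
        uint64_t star = ((uint64_t)user32_cs << 48) | ((uint64_t)kernel_cs << 32);

        printf("MSR_STAR             = %#018llx\n", (unsigned long long)star);
        printf("SYSCALL base selector = %#llx (SS is derived from it)\n",
               (unsigned long long)((star >> 32) & 0xffff));
        printf("SYSRET base selector  = %#llx\n",
               (unsigned long long)((star >> 48) & 0xffff));
        return 0;
}
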
+ */ +DEFINE_PER_CPU(struct orig_ist, orig_ist); +#endif + +#ifndef CONFIG_X86_NO_IDT +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); + +int is_debug_stack(unsigned long addr) +{ + return __get_cpu_var(debug_stack_usage) || + (addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); +} + +void debug_stack_set_zero(void) +{ + load_idt((const struct desc_ptr *)&nmi_idt_descr); +} + +void debug_stack_reset(void) +{ + load_idt((const struct desc_ptr *)&idt_descr); +} +#endif + +#else /* CONFIG_X86_64 */ + +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); + +#ifdef CONFIG_CC_STACKPROTECTOR +DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); +#endif + +/* Make sure %fs and %gs are initialized properly in idle threads */ +struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + regs->fs = __KERNEL_PERCPU; + regs->gs = __KERNEL_STACK_CANARY; + + return regs; +} +#endif /* CONFIG_X86_64 */ + +/* + * Clear all 6 debug registers: + */ +static void clear_all_debug_regs(void) +{ + int i; + + for (i = 0; i < 8; i++) { + /* Ignore db4, db5 */ + if ((i == 4) || (i == 5)) + continue; + + set_debugreg(0, i); + } +} + +#ifdef CONFIG_KGDB +/* + * Restore debug regs if using kgdbwait and you have a kernel debugger + * connection established. + */ +static void dbg_restore_debug_regs(void) +{ + if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break)) + arch_kgdb_ops.correct_hw_break(); +} +#else /* ! CONFIG_KGDB */ +#define dbg_restore_debug_regs() +#endif /* ! CONFIG_KGDB */ + +#ifndef CONFIG_XEN +/* + * Prints an error where the NUMA and configured core-number mismatch and the + * platform didn't override this to fix it up + */ +void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); +} +#endif + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. 
+ * A lot of state is already set up in PDA init for 64 bit + */ +#ifdef CONFIG_X86_64 + +void __cpuinit cpu_init(void) +{ +#ifndef CONFIG_X86_NO_TSS + struct orig_ist *oist; + struct tss_struct *t; + unsigned long v; + int i; +#endif + struct task_struct *me; + int cpu; + + cpu = stack_smp_processor_id(); + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) + xen_switch_pt(); +#ifndef CONFIG_X86_NO_TSS + t = &per_cpu(init_tss, cpu); + oist = &per_cpu(orig_ist, cpu); +#endif + +#ifdef CONFIG_NUMA + if (cpu != 0 && percpu_read(numa_node) == 0 && + early_cpu_to_node(cpu) != NUMA_NO_NODE) + set_numa_node(early_cpu_to_node(cpu)); +#endif + + me = current; + + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) + panic("CPU#%d already initialized!\n", cpu); + + pr_debug("Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + + switch_to_new_gdt(cpu); + loadsegment(fs, 0); + +#ifndef CONFIG_X86_NO_IDT + load_idt((const struct desc_ptr *)&idt_descr); +#endif + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + x86_configure_nx(); +#ifdef CONFIG_X86_LOCAL_APIC + if (cpu != 0) + enable_x2apic(); +#endif + +#ifndef CONFIG_X86_NO_TSS + /* + * set up and load the per-CPU TSS + */ + if (!oist->ist[0]) { + char *estacks = per_cpu(exception_stacks, cpu); + + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + estacks += exception_stack_sizes[v]; + oist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; +#ifndef CONFIG_X86_NO_IDT + if (v == DEBUG_STACK-1) + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; +#endif + } + } + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. 
+ */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; +#endif + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + BUG_ON(me->mm); + enter_lazy_tlb(&init_mm, me); + + load_sp0(t, &current->thread); +#ifndef CONFIG_X86_NO_TSS + set_tss_desc(cpu, t); + load_TR_desc(); +#endif + load_LDT(&init_mm.context); + + clear_all_debug_regs(); + dbg_restore_debug_regs(); + + fpu_init(); + xsave_init(); + +#ifndef CONFIG_XEN + raw_local_save_flags(kernel_eflags); +#else + asm ("pushfq; popq %0" : "=rm" (kernel_eflags)); + if (raw_irqs_disabled()) + kernel_eflags &= ~X86_EFLAGS_IF; +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + if (is_uv_system()) + uv_cpu_init(); +#endif +} + +#else + +void __cpuinit cpu_init(void) +{ + int cpu = smp_processor_id(); + struct task_struct *curr = current; +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *t = &per_cpu(init_tss, cpu); +#endif + struct thread_struct *thread = &curr->thread; + + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); + for (;;) + local_irq_enable(); + } + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + if (cpu_has_vme || cpu_has_de) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + switch_to_new_gdt(cpu); + + /* + * Set up and load the per-CPU TSS and LDT + */ + atomic_inc(&init_mm.mm_count); + curr->active_mm = &init_mm; + BUG_ON(curr->mm); + enter_lazy_tlb(&init_mm, curr); + + load_sp0(t, thread); + + load_LDT(&init_mm.context); + +#ifndef CONFIG_X86_NO_TSS + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); +#endif + +#ifdef CONFIG_DOUBLEFAULT + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); +#endif + + clear_all_debug_regs(); + dbg_restore_debug_regs(); + + fpu_init(); + xsave_init(); +} +#endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3e6ff6c..c55ce62 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -36,10 +36,15 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { +#ifndef CONFIG_XEN misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); c->cpuid_level = cpuid_eax(0); get_cpu_cap(c); +#else + pr_warning("CPUID levels are restricted -" + " update hypervisor\n"); +#endif } } @@ -47,6 +52,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +#ifndef CONFIG_XEN if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { unsigned lower_word; @@ -69,6 +75,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); clear_cpu_cap(c, X86_FEATURE_PSE); } +#endif #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); @@ -93,8 +100,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); +#ifndef CONFIG_XEN if (!check_tsc_unstable()) sched_clock_stable = 1; +#endif } /* @@ -238,9 +247,13 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) { printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); +#ifndef CONFIG_XEN printk (KERN_INFO 
"CPU: Disabling hardware prefetching (Errata 037)\n"); lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); +#else + pr_warning("CPU: Hypervisor update needed\n"); +#endif } } @@ -285,6 +298,7 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) } #endif +#ifndef CONFIG_XEN static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { #ifdef CONFIG_NUMA @@ -357,6 +371,7 @@ static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_VPID); } } +#endif static void __cpuinit init_intel(struct cpuinfo_x86 *c) { @@ -440,6 +455,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_P3); #endif +#ifndef CONFIG_XEN if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { /* * let's use the legacy cpuid vector 0x1 and 0x4 for topology @@ -456,6 +472,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); +#endif /* * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not. diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6b45e5e..2c06ab4 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -279,8 +279,9 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; eax->split.num_threads_sharing = 0; +#ifndef CONFIG_XEN eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; - +#endif if (assoc == 0xffff) eax->split.is_fully_associative = 1; @@ -298,7 +299,7 @@ struct _cache_attr { unsigned int); }; -#ifdef CONFIG_AMD_NB +#if defined(CONFIG_AMD_NB) && !defined(CONFIG_XEN) /* * L3 cache descriptors @@ -579,8 +580,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ - unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; #ifdef CONFIG_X86_HT + unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; unsigned int cpu = c->cpu_index; #endif @@ -614,16 +615,20 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) break; case 2: new_l2 = this_leaf.size/1024; +#ifdef CONFIG_X86_HT num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); l2_id = c->apicid >> index_msb; +#endif break; case 3: new_l3 = this_leaf.size/1024; +#ifdef CONFIG_X86_HT num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order( num_threads_sharing); l3_id = c->apicid >> index_msb; +#endif break; default: break; @@ -724,7 +729,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf, *sibling_leaf; @@ -954,7 +959,7 @@ static struct attribute *default_attrs[] = { NULL }; -#ifdef CONFIG_AMD_NB +#if defined(CONFIG_AMD_NB) && !defined(CONFIG_XEN) static struct attribute ** __cpuinit amd_l3_attrs(void) { static struct attribute **attrs; @@ -1100,7 +1105,7 @@ static int __cpuinit cache_add_dev(struct device *dev) this_leaf = CPUID4_INFO_IDX(cpu, i); 
ktype_cache.default_attrs = default_attrs; -#ifdef CONFIG_AMD_NB +#if defined(CONFIG_AMD_NB) && !defined(CONFIG_XEN) if (this_leaf->base.nb) ktype_cache.default_attrs = amd_l3_attrs(); #endif diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index bb34b03..21e0a8a 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -3,6 +3,7 @@ obj-y = mce.o mce-severity.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o +obj-$(CONFIG_X86_XEN_MCE) += mce_dom0.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index fc4beb3..abdbadd 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -93,6 +93,7 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) return NMI_HANDLED; } +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) static void mce_irq_ipi(void *info) { int cpu = smp_processor_id(); @@ -104,6 +105,7 @@ static void mce_irq_ipi(void *info) raise_exception(m, NULL); } } +#endif /* Inject mce on current CPU */ static int raise_local(void) @@ -151,7 +153,7 @@ static void raise_mce(struct mce *m) if (context == MCJ_CTX_RANDOM) return; -#ifdef CONFIG_X86_LOCAL_APIC +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_XEN) if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a11ae2..4457a8f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -118,8 +118,10 @@ void mce_setup(struct mce *m) m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); +#ifndef CONFIG_XEN m->socketid = cpu_data(m->extcpu).phys_proc_id; m->apicid = cpu_data(m->extcpu).initial_apicid; +#endif rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } @@ -266,9 +268,14 @@ static void print_mce(struct mce *m) * Note this output is parsed by external tools and old fields * should not be changed. */ +#ifndef CONFIG_XEN pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, cpu_data(m->extcpu).microcode); +#else + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); +#endif /* * Print out human-readable details about the MCE error, @@ -1153,8 +1160,15 @@ void mce_log_therm_throt_event(__u64 status) * Periodic polling timer for "silent" machine check errors. If the * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). 
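
/*
 * Editor's illustration, not part of the patch: in the init_intel_cacheinfo()
 * hunks above, l2_id/l3_id are derived by shifting the APIC ID right by the
 * number of bits needed for the threads sharing that cache
 * (get_count_order()).  The APIC ID and sharing counts below are sample
 * values; get_count_order() is re-implemented as a plain ceil(log2()).
 */
#include <stdio.h>

static int get_count_order(unsigned int count)
{
        int order = 0;

        while ((1u << order) < count)
                order++;
        return order;
}

int main(void)
{
        unsigned int apicid = 0x15;     /* sample APIC ID */
        unsigned int l2_sharing = 2;    /* threads sharing the L2 */
        unsigned int l3_sharing = 16;   /* threads sharing the L3 */

        printf("l2_id = %u\n", apicid >> get_count_order(l2_sharing));
        printf("l3_id = %u\n", apicid >> get_count_order(l3_sharing));
        return 0;
}
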
+ * + * We will disable polling in DOM0 since all CMCI/Polling + * mechanism will be done in XEN for Intel CPUs */ +#if defined (CONFIG_X86_XEN_MCE) +static int check_interval = 0; /* disable polling */ +#else static int check_interval = 5 * 60; /* 5 minutes */ +#endif static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); @@ -1329,6 +1343,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { +#ifndef CONFIG_XEN if (c->x86 == 15 && banks > 4) { /* * disable GART TBL walk error reporting, which @@ -1337,6 +1352,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } +#endif if (c->x86 <= 17 && mce_bootlog < 0) { /* * Lots of broken BIOS around that don't clear them @@ -1409,6 +1425,7 @@ static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { +#ifndef CONFIG_X86_64_XEN switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); @@ -1419,6 +1436,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) default: break; } +#endif } static void __mcheck_cpu_init_timer(void) @@ -2201,6 +2219,16 @@ static __init int mcheck_init_device(void) /* register character device /dev/mcelog */ misc_register(&mce_chrdev_device); +#ifdef CONFIG_X86_XEN_MCE + if (is_initial_xendomain()) { + /* Register vIRQ handler for MCE LOG processing */ + extern int bind_virq_for_mce(void); + + printk(KERN_DEBUG "MCE: bind virq for DOM0 logging\n"); + bind_virq_for_mce(); + } +#endif + return err; } device_initcall(mcheck_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_dom0.c b/arch/x86/kernel/cpu/mcheck/mce_dom0.c new file mode 100644 index 0000000..b6d5c3e --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_dom0.c @@ -0,0 +1,185 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static xen_mc_logical_cpu_t *g_physinfo; +static unsigned int ncpus; + +static int convert_log(struct mc_info *mi) +{ + struct mcinfo_common *mic = NULL; + struct mcinfo_global *mc_global; + struct mcinfo_bank *mc_bank; + struct mce m; + unsigned int i; + bool found = false; + + x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL); + if (mic == NULL) + { + pr_err("DOM0_MCE_LOG: global data is NULL\n"); + return -1; + } + + mce_setup(&m); + mc_global = (struct mcinfo_global*)mic; + m.mcgstatus = mc_global->mc_gstatus; + m.apicid = mc_global->mc_apicid; + + for (i = 0; i < ncpus; i++) + if (g_physinfo[i].mc_apicid == m.apicid) { + found = true; + break; + } + WARN_ON_ONCE(!found); + m.socketid = mc_global->mc_socketid; + m.cpu = m.extcpu = g_physinfo[i].mc_cpunr; + m.cpuvendor = (__u8)g_physinfo[i].mc_vendor; + + x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); + do + { + if (mic == NULL || mic->size == 0) + break; + if (mic->type == MC_TYPE_BANK) + { + mc_bank = (struct mcinfo_bank*)mic; + m.misc = mc_bank->mc_misc; + m.status = mc_bank->mc_status; + m.addr = mc_bank->mc_addr; + m.tsc = mc_bank->mc_tsc; + m.bank = mc_bank->mc_bank; + printk(KERN_DEBUG "[CPU%d, BANK%d, addr %llx, state %llx]\n", + m.bank, m.cpu, m.addr, m.status); + /*log this record*/ + mce_log(&m); + } + mic = x86_mcinfo_next(mic); + }while (1); + + return 0; +} + +static struct mc_info *g_mi; + +/*dom0 mce virq handler, logging physical mce error info*/ + +static irqreturn_t 
mce_dom0_interrupt(int irq, void *dev_id) +{ + xen_mc_t mc_op; + int result = 0; + + printk(KERN_DEBUG "MCE_DOM0_LOG: enter dom0 mce vIRQ handler\n"); + mc_op.cmd = XEN_MC_fetch; + set_xen_guest_handle(mc_op.u.mc_fetch.data, g_mi); +urgent: + mc_op.u.mc_fetch.flags = XEN_MC_URGENT; + result = HYPERVISOR_mca(&mc_op); + if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA || + mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) + { + printk(KERN_DEBUG "MCE_DOM0_LOG: No more urgent data\n"); + goto nonurgent; + } + else + { + result = convert_log(g_mi); + if (result) { + pr_err("MCE_DOM0_LOG: Log conversion failed\n"); + goto end; + } + /* After fetching the telem from DOM0, we need to dec the telem's + * refcnt and release the entry. The telem is reserved and inc + * refcnt when filling the telem. + */ + mc_op.u.mc_fetch.flags = XEN_MC_URGENT | XEN_MC_ACK; + result = HYPERVISOR_mca(&mc_op); + + goto urgent; + } +nonurgent: + mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT; + result = HYPERVISOR_mca(&mc_op); + if (result || mc_op.u.mc_fetch.flags & XEN_MC_NODATA || + mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) + { + printk(KERN_DEBUG "MCE_DOM0_LOG: No more nonurgent data\n"); + goto end; + } + else + { + result = convert_log(g_mi); + if (result) { + pr_err("MCE_DOM0_LOG: Log conversion failed\n"); + goto end; + } + /* After fetching the telem from DOM0, we need to dec the telem's + * refcnt and release the entry. The telem is reserved and inc + * refcnt when filling the telem. + */ + mc_op.u.mc_fetch.flags = XEN_MC_NONURGENT | XEN_MC_ACK; + result = HYPERVISOR_mca(&mc_op); + + goto nonurgent; + } +end: + return IRQ_HANDLED; +} + +int __init bind_virq_for_mce(void) +{ + int ret; + xen_mc_t mc_op; + + g_mi = kmalloc(sizeof(*g_mi), GFP_KERNEL); + if (!g_mi) + return -ENOMEM; + + /* fetch physical CPU count */ + mc_op.cmd = XEN_MC_physcpuinfo; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, NULL); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("MCE: Failed to get physical CPU count\n"); + kfree(g_mi); + return ret; + } + + /* fetch CPU physical info for later reference */ + ncpus = mc_op.u.mc_physcpuinfo.ncpus; + g_physinfo = kmalloc(sizeof(*g_physinfo) * ncpus, GFP_KERNEL); + if (!g_physinfo) { + kfree(g_mi); + return -ENOMEM; + } + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("MCE: Failed to get physical CPUs' info\n"); + kfree(g_mi); + kfree(g_physinfo); + return ret; + } + + ret = bind_virq_to_irqhandler(VIRQ_MCA, 0, + mce_dom0_interrupt, 0, "mce", NULL); + + if (ret < 0) { + pr_err("MCE: Failed to bind vIRQ for Dom0\n"); + kfree(g_mi); + kfree(g_physinfo); + return ret; + } + + /* Log the machine checks left over from the previous reset. 
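The XEN_MC_fetch / XEN_MC_ACK sequence implemented above is, at its core, a per-severity drain loop: fetch a record, convert and log it, acknowledge it so the hypervisor can release the telemetry slot, and repeat until no data is left. A standalone sketch with stand-in helpers (not the real hypercall interface):

/* Standalone sketch of the drain loop above.  All helpers and data
 * here are made up for illustration. */
#include <stdio.h>
#include <stdbool.h>

enum severity { URGENT, NONURGENT };

static int pending[2] = { 2, 1 };       /* fake per-severity backlog */

static bool fetch_record(enum severity s)
{
        return pending[s] > 0;
}

static void convert_and_log(enum severity s)
{
        printf("logged %s record\n", s == URGENT ? "urgent" : "nonurgent");
}

static void ack_record(enum severity s)
{
        pending[s]--;           /* mirrors the XEN_MC_ACK step above */
}

static void drain(enum severity s)
{
        while (fetch_record(s)) {
                convert_and_log(s);
                ack_record(s);
        }
}

int main(void)
{
        drain(URGENT);          /* urgent records first, as above */
        drain(NONURGENT);
        return 0;
}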
*/ + mce_dom0_interrupt(VIRQ_MCA, NULL); + + return 0; +} + diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index ad9e5ed..b854116 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,4 @@ obj-y := main.o if.o generic.o cleanup.o obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o +obj-$(CONFIG_XEN) := main.o if.o diff --git a/arch/x86/kernel/cpu/mtrr/main-xen.c b/arch/x86/kernel/cpu/mtrr/main-xen.c new file mode 100644 index 0000000..013e120 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/main-xen.c @@ -0,0 +1,326 @@ +#define DEBUG + +#include +#include +#include +#include + +#include +#include "mtrr.h" + +static DEFINE_MUTEX(mtrr_mutex); + +void generic_get_mtrr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type) +{ + struct xen_platform_op op; + + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = reg; + if (unlikely(HYPERVISOR_platform_op(&op))) + memset(&op.u.read_memtype, 0, sizeof(op.u.read_memtype)); + + *size = op.u.read_memtype.nr_mfns; + *base = op.u.read_memtype.mfn; + *type = op.u.read_memtype.type; +} + +const struct mtrr_ops generic_mtrr_ops = { + .use_intel_if = 1, + .get = generic_get_mtrr, +}; + +const struct mtrr_ops *mtrr_if = &generic_mtrr_ops; +unsigned int num_var_ranges; +unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; + +static u64 tom2; + +static void __init set_num_var_ranges(void) +{ + struct xen_platform_op op; + + for (num_var_ranges = 0; ; num_var_ranges++) { + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = num_var_ranges; + if (HYPERVISOR_platform_op(&op) != 0) + break; + } +} + +static void __init init_table(void) +{ + int i, max; + + max = num_var_ranges; + for (i = 0; i < max; i++) + mtrr_usage_table[i] = 0; +} + +int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, bool increment) +{ + int error; + struct xen_platform_op op; + + mutex_lock(&mtrr_mutex); + + op.cmd = XENPF_add_memtype; + op.u.add_memtype.mfn = base; + op.u.add_memtype.nr_mfns = size; + op.u.add_memtype.type = type; + error = HYPERVISOR_platform_op(&op); + if (error) { + mutex_unlock(&mtrr_mutex); + BUG_ON(error > 0); + return error; + } + + if (increment) + ++mtrr_usage_table[op.u.add_memtype.reg]; + + mutex_unlock(&mtrr_mutex); + + return op.u.add_memtype.reg; +} + +static int mtrr_check(unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); + dump_stack(); + return -1; + } + return 0; +} + +int mtrr_add(unsigned long base, unsigned long size, unsigned int type, + bool increment) +{ + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + increment); +} +EXPORT_SYMBOL(mtrr_add); + +int mtrr_del_page(int reg, unsigned long base, unsigned long size) +{ + unsigned i; + mtrr_type ltype; + unsigned long lbase, lsize; + int error = -EINVAL; + struct xen_platform_op op; + + mutex_lock(&mtrr_mutex); + + if (reg < 0) { + /* Search for existing MTRR */ + for (i = 0; i < num_var_ranges; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lbase == base && lsize == size) { + reg = i; + break; + } + } + if (reg < 0) { + pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", + base, size); + goto out; + } + } + if (mtrr_usage_table[reg] < 1) { + pr_warning("mtrr: reg: %d has count=0\n", reg); + goto out; + } + if 
(--mtrr_usage_table[reg] < 1) { + op.cmd = XENPF_del_memtype; + op.u.del_memtype.handle = 0; + op.u.del_memtype.reg = reg; + error = HYPERVISOR_platform_op(&op); + if (error) { + BUG_ON(error > 0); + goto out; + } + } + error = reg; + out: + mutex_unlock(&mtrr_mutex); + return error; +} + +int mtrr_del(int reg, unsigned long base, unsigned long size) +{ + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); +} +EXPORT_SYMBOL(mtrr_del); + +/* + * Returns the effective MTRR type for the region + * Error returns: + * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR + * - 0xFF - when MTRR is not enabled + */ +u8 mtrr_type_lookup(u64 start, u64 end) +{ + int i, error; + u64 start_mfn, end_mfn, base_mfn, top_mfn; + u8 prev_match, curr_match; + struct xen_platform_op op; + + if (!is_initial_xendomain()) + return MTRR_TYPE_WRBACK; + + if (!num_var_ranges) + return 0xFF; + + start_mfn = start >> PAGE_SHIFT; + /* Make end inclusive end, instead of exclusive */ + end_mfn = --end >> PAGE_SHIFT; + + /* Look in fixed ranges. Just return the type as per start */ + if (start_mfn < 0x100) { +#if 0//todo + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = ???; + error = HYPERVISOR_platform_op(&op); + if (!error) + return op.u.read_memtype.type; +#endif + return MTRR_TYPE_UNCACHABLE; + } + + /* + * Look in variable ranges + * Look of multiple ranges matching this address and pick type + * as per MTRR precedence + */ + prev_match = 0xFF; + for (i = 0; i < num_var_ranges; ++i) { + op.cmd = XENPF_read_memtype; + op.u.read_memtype.reg = i; + error = HYPERVISOR_platform_op(&op); + + if (error || !op.u.read_memtype.nr_mfns) + continue; + + base_mfn = op.u.read_memtype.mfn; + top_mfn = base_mfn + op.u.read_memtype.nr_mfns - 1; + + if (base_mfn > end_mfn || start_mfn > top_mfn) { + continue; + } + + if (base_mfn > start_mfn || end_mfn > top_mfn) { + return 0xFE; + } + + curr_match = op.u.read_memtype.type; + if (prev_match == 0xFF) { + prev_match = curr_match; + continue; + } + + if (prev_match == MTRR_TYPE_UNCACHABLE || + curr_match == MTRR_TYPE_UNCACHABLE) { + return MTRR_TYPE_UNCACHABLE; + } + + if ((prev_match == MTRR_TYPE_WRBACK && + curr_match == MTRR_TYPE_WRTHROUGH) || + (prev_match == MTRR_TYPE_WRTHROUGH && + curr_match == MTRR_TYPE_WRBACK)) { + prev_match = MTRR_TYPE_WRTHROUGH; + curr_match = MTRR_TYPE_WRTHROUGH; + } + + if (prev_match != curr_match) { + return MTRR_TYPE_UNCACHABLE; + } + } + + if (tom2) { + if (start >= (1ULL<<32) && (end < tom2)) + return MTRR_TYPE_WRBACK; + } + + if (prev_match != 0xFF) + return prev_match; + +#if 0//todo + op.cmd = XENPF_read_def_memtype; + error = HYPERVISOR_platform_op(&op); + if (!error) + return op.u.read_def_memtype.type; +#endif + return MTRR_TYPE_UNCACHABLE; +} + +/* + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB + * for memory >4GB. Check for that here. + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't + * apply to are wrong, but so far we don't know of any such case in the wild. 
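The precedence rules applied by mtrr_type_lookup() above when several variable ranges overlap (UC dominates, WB combined with WT degrades to WT, any other mismatch is treated as UC) condense into a single helper. The sketch below is standalone; the type values mirror the usual MTRR encodings but are defined locally for illustration:

/* Standalone sketch of the precedence rule used in mtrr_type_lookup(). */
#include <stdio.h>

#define MTRR_TYPE_UNCACHABLE 0
#define MTRR_TYPE_WRTHROUGH  4
#define MTRR_TYPE_WRBACK     6

static unsigned char combine_mtrr_types(unsigned char a, unsigned char b)
{
        if (a == b)
                return a;
        if (a == MTRR_TYPE_UNCACHABLE || b == MTRR_TYPE_UNCACHABLE)
                return MTRR_TYPE_UNCACHABLE;
        if ((a == MTRR_TYPE_WRBACK && b == MTRR_TYPE_WRTHROUGH) ||
            (a == MTRR_TYPE_WRTHROUGH && b == MTRR_TYPE_WRBACK))
                return MTRR_TYPE_WRTHROUGH;
        return MTRR_TYPE_UNCACHABLE;
}

int main(void)
{
        /* WB + WT -> WT (4); WB + UC -> UC (0) */
        printf("%d\n", combine_mtrr_types(MTRR_TYPE_WRBACK, MTRR_TYPE_WRTHROUGH));
        printf("%d\n", combine_mtrr_types(MTRR_TYPE_WRBACK, MTRR_TYPE_UNCACHABLE));
        return 0;
}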
+ */ +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) + +static int __init _amd_special_default_mtrr(void) +{ + u32 l, h; + + if (!is_initial_xendomain()) + return 0; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return 0; + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + return 0; + /* In case some hypervisor doesn't pass SYSCFG through */ + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + return 0; + /* + * Memory between 4GB and top of mem is forced WB by this magic bit. + * Reserved before K8RevF, but should be zero there. + */ + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == + (Tom2Enabled | Tom2ForceMemTypeWB)) + return 1; + return 0; +} + +void __init mtrr_bp_init(void) +{ + if (_amd_special_default_mtrr()) { + /* TOP_MEM2 */ + rdmsrl(MSR_K8_TOP_MEM2, tom2); + tom2 &= 0xffffff8000000ULL; + } +} + +void mtrr_ap_init(void) +{ +} + +static int __init mtrr_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if (!is_initial_xendomain()) + return -ENODEV; + + if ((!cpu_has(c, X86_FEATURE_MTRR)) && + (!cpu_has(c, X86_FEATURE_K6_MTRR)) && + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) + return -ENODEV; + + set_num_var_ranges(); + init_table(); + + return 0; +} + +subsys_initcall(mtrr_init); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 8022c66..845e3bd 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -10,7 +10,7 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, unsigned int cpu) { -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) if (c->x86_max_cores * smp_num_siblings > 1) { seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); seq_printf(m, "siblings\t: %d\n", @@ -32,18 +32,22 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) */ int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu); seq_printf(m, +#ifndef CONFIG_XEN "fdiv_bug\t: %s\n" "hlt_bug\t\t: %s\n" "f00f_bug\t: %s\n" "coma_bug\t: %s\n" +#endif "fpu\t\t: %s\n" "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: %s\n", +#ifndef CONFIG_XEN c->fdiv_bug ? "yes" : "no", c->hlt_works_ok ? "no" : "yes", c->f00f_bug ? "yes" : "no", c->coma_bug ? "yes" : "no", +#endif c->hard_math ? "yes" : "no", fpu_exception ? 
"yes" : "no", c->cpuid_level, @@ -83,8 +87,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else seq_printf(m, "stepping\t: unknown\n"); +#ifndef CONFIG_XEN if (c->microcode) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); +#endif if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = cpufreq_quick_get(cpu); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index addf9e8..a1c53de 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -41,6 +41,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, +#ifndef CONFIG_XEN { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, @@ -51,6 +52,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 }, { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 }, { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 }, +#endif { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 4397e98..dc581ec 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -28,7 +28,7 @@ */ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && !defined(CONFIG_XEN) unsigned int eax, ebx, ecx, edx, sub_index; unsigned int ht_mask_width, core_plus_mask_width; unsigned int core_select_mask, core_level_siblings; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index caf5de0..d9b4f99 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -21,6 +21,7 @@ #define N_EXCEPTION_STACKS_END \ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) +#ifndef CONFIG_X86_NO_TSS static char x86_stack_ids[][8] = { [ DEBUG_STACK-1 ] = "#DB", [ NMI_STACK-1 ] = "NMI", @@ -32,10 +33,12 @@ static char x86_stack_ids[][8] = { N_EXCEPTION_STACKS_END ] = "#DB[?]" #endif }; +#endif static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) { +#ifndef CONFIG_X86_NO_TSS unsigned k; /* @@ -95,6 +98,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, } #endif } +#endif /* CONFIG_X86_NO_TSS */ return NULL; } diff --git a/arch/x86/kernel/e820-xen.c b/arch/x86/kernel/e820-xen.c new file mode 100644 index 0000000..481afb7 --- /dev/null +++ b/arch/x86/kernel/e820-xen.c @@ -0,0 +1,1291 @@ +/* + * Handle the memory map. + * The functions here do the job until bootmem takes over. + * + * Getting sanitize_e820_map() in sync with i386 version by applying change: + * - Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach , December 2002. + * Venkatesh Pallipadi + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * The e820 map is the map that gets modified e.g. with command line parameters + * and that is also registered with modifications in the kernel resource tree + * with the iomem_resource as parent. + * + * The e820_saved is directly saved after the BIOS-provided memory map is + * copied. It doesn't get modified afterwards. 
It's registered for the + * /sys/firmware/memmap interface. + * + * That memory map is not modified and is used as base for kexec. The kexec'd + * kernel should get the same memory map as the firmware provides. Then the + * user can e.g. boot the original kernel with mem=1G while still booting the + * next kernel with full memory. + */ +struct e820map e820; +#if !defined(CONFIG_XEN) +struct e820map e820_saved; +#elif defined(CONFIG_XEN_PRIVILEGED_GUEST) +struct e820map machine_e820; +# define e820_saved machine_e820 +#else +# define machine_e820 e820 +# define e820_saved e820 +#endif + +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0xaeedbabe; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +/* + * This function checks if any part of the range is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + +#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; +#else + if (!is_initial_xendomain()) + return 0; + for (i = 0; i < machine_e820.nr_map; ++i) { + const struct e820entry *ei = &machine_e820.map[i]; +#endif + + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + +/* + * This function checks if the entire range is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init e820_all_mapped(u64 start, u64 end, unsigned type) +{ + int i; + +#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; +#else + if (!is_initial_xendomain()) + return 0; + for (i = 0; i < machine_e820.nr_map; ++i) { + const struct e820entry *ei = &machine_e820.map[i]; +#endif + + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + + /* if the region is at the beginning of we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* + * if start is now at or beyond end, we're done, full + * coverage + */ + if (start >= end) + return 1; + } + return 0; +} + +/* + * Add a memory region to the kernel e820 map. + */ +static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, + int type) +{ + int x = e820x->nr_map; + + if (x >= ARRAY_SIZE(e820x->map)) { + printk(KERN_ERR "Ooops! 
Too many entries in the memory map!\n"); + return; + } + + e820x->map[x].addr = start; + e820x->map[x].size = size; + e820x->map[x].type = type; + e820x->nr_map++; +} + +void __init e820_add_region(u64 start, u64 size, int type) +{ + __e820_add_region(&e820, start, size, type); +} + +static void __init e820_print_type(u32 type) +{ + switch (type) { + case E820_RAM: + case E820_RESERVED_KERN: + printk(KERN_CONT "(usable)"); + break; + case E820_RESERVED: + printk(KERN_CONT "(reserved)"); + break; + case E820_ACPI: + printk(KERN_CONT "(ACPI data)"); + break; + case E820_NVS: + printk(KERN_CONT "(ACPI NVS)"); + break; + case E820_UNUSABLE: + printk(KERN_CONT "(unusable)"); + break; + default: + printk(KERN_CONT "type %u", type); + break; + } +} + +static void __init _e820_print_map(const struct e820map *e820, const char *who) +{ + int i; + + for (i = 0; i < e820->nr_map; i++) { + printk(KERN_INFO " %s: %016Lx - %016Lx ", who, + (unsigned long long) e820->map[i].addr, + (unsigned long long) + (e820->map[i].addr + e820->map[i].size)); + e820_print_type(e820->map[i].type); + printk(KERN_CONT "\n"); + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps, + * and resolving conflicting memory types in favor of highest + * numbered type. + * + * The input parameter biosmap points to an array of 'struct + * e820entry' which on entry has elements in the range [0, *pnr_map) + * valid, and which has space for up to max_nr_map entries. + * On return, the resulting sanitized e820 map entries will be in + * overwritten in the same location, starting at biosmap. + * + * The integer pointed to by pnr_map must be valid on entry (the + * current number of valid entries located at biosmap) and will + * be updated on return, with the new number of valid entries + * (something no more than max_nr_map.) + * + * The return value from sanitize_e820_map() is zero if it + * successfully 'sanitized' the map entries passed in, and is -1 + * if it did nothing, which can happen if either of (1) it was + * only passed one map entry, or (2) any of the input map entries + * were invalid (start + size < start, meaning that the size was + * so big the described memory range wrapped around through zero.) + * + * Visually we're performing the following + * (1,2,3,4 = memory types)... 
+ * + * Sample memory map (w/overlaps): + * ____22__________________ + * ______________________4_ + * ____1111________________ + * _44_____________________ + * 11111111________________ + * ____________________33__ + * ___________44___________ + * __________33333_________ + * ______________22________ + * ___________________2222_ + * _________111111111______ + * _____________________11_ + * _________________4______ + * + * Sanitized equivalent (no overlap): + * 1_______________________ + * _44_____________________ + * ___1____________________ + * ____22__________________ + * ______11________________ + * _________1______________ + * __________3_____________ + * ___________44___________ + * _____________33_________ + * _______________2________ + * ________________1_______ + * _________________4______ + * ___________________2____ + * ____________________33__ + * ______________________4_ + */ +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; + +static int __init cpcompare(const void *a, const void *b) +{ + struct change_member * const *app = a, * const *bpp = b; + const struct change_member *ap = *app, *bp = *bpp; + + /* + * Inputs are pointers to two elements of change_point[]. If their + * addresses are unequal, their difference dominates. If the addresses + * are equal, then consider one that represents the end of its region + * to be greater than one that does not. + */ + if (ap->addr != bp->addr) + return ap->addr > bp->addr ? 1 : -1; + + return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr); +} + +int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, + u32 *pnr_map) +{ + static struct change_member change_point_list[2*E820_X_MAX] __initdata; + static struct change_member *change_point[2*E820_X_MAX] __initdata; + static struct e820entry *overlap_list[E820_X_MAX] __initdata; + static struct e820entry new_bios[E820_X_MAX] __initdata; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* if there's only one memory region, don't bother */ +#ifdef CONFIG_XEN + if (*pnr_map == 1) + return 0; +#endif + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + BUG_ON(old_nr > max_nr_map); + + /* bail out if we find any unreasonable addresses in bios map */ + for (i = 0; i < old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + return -1; + + /* create pointers for initial change-point information (for sorting) */ + for (i = 0; i < 2 * old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ + chgidx = 0; + for (i = 0; i < old_nr; i++) { + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; + + /* sort change-point list by memory addresses (low -> high) */ + sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL); + + /* create a new bios memory map, removing overlaps */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start 
with 0 as last starting address */ + + /* loop through change-points, determining affect on the new bios map */ + for (chgidx = 0; chgidx < chg_nr; chgidx++) { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == + change_point[chgidx]->pbios->addr) { + /* + * add map entry to overlap list (> 1 entry + * implies an overlap) + */ + overlap_list[overlap_entries++] = + change_point[chgidx]->pbios; + } else { + /* + * remove entry from list (order independent, + * so swap with last) + */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == + change_point[chgidx]->pbios) + overlap_list[i] = + overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* + * if there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ + current_type = 0; + for (i = 0; i < overlap_entries; i++) + if (overlap_list[i]->type > current_type) + current_type = overlap_list[i]->type; + /* + * continue building up new bios map based on this + * information + */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* + * move forward only if the new size + * was non-zero + */ + if (new_bios[new_bios_entry].size != 0) + /* + * no more space left for new + * bios entries ? + */ + if (++new_bios_entry >= max_nr_map) + break; + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = + change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr = change_point[chgidx]->addr; + } + last_type = current_type; + } + } + /* retain count for new bios entries */ + new_nr = new_bios_entry; + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +static int __init __append_e820_map(struct e820entry *biosmap, int nr_map) +{ + while (nr_map) { + u64 start = biosmap->addr; + u64 size = biosmap->size; + u64 end = start + size; + u32 type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + e820_add_region(start, size, type); + + biosmap++; + nr_map--; + } + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + */ +static int __init append_e820_map(struct e820entry *biosmap, int nr_map) +{ +#ifndef CONFIG_XEN + /* Only one memory region (or negative)? 
Ignore it */ + if (nr_map < 2) + return -1; +#else + BUG_ON(nr_map < 1); +#endif + + return __append_e820_map(biosmap, nr_map); +} + +static u64 __init __e820_update_range(struct e820map *e820x, u64 start, + u64 size, unsigned old_type, + unsigned new_type) +{ + u64 end; + unsigned int i; + u64 real_updated_size = 0; + + BUG_ON(old_type == new_type); + + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + + end = start + size; + printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + e820_print_type(old_type); + printk(KERN_CONT " ==> "); + e820_print_type(new_type); + printk(KERN_CONT "\n"); + + for (i = 0; i < e820x->nr_map; i++) { + struct e820entry *ei = &e820x->map[i]; + u64 final_start, final_end; + u64 ei_end; + + if (ei->type != old_type) + continue; + + ei_end = ei->addr + ei->size; + /* totally covered by new range? */ + if (ei->addr >= start && ei_end <= end) { + ei->type = new_type; + real_updated_size += ei->size; + continue; + } + + /* new range is totally covered? */ + if (ei->addr < start && ei_end > end) { + __e820_add_region(e820x, start, size, new_type); + __e820_add_region(e820x, end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_updated_size += size; + continue; + } + + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(end, ei_end); + if (final_start >= final_end) + continue; + + __e820_add_region(e820x, final_start, final_end - final_start, + new_type); + + real_updated_size += final_end - final_start; + + /* + * left range could be head or tail, so need to update + * size at first. + */ + ei->size -= final_end - final_start; + if (ei->addr < final_start) + continue; + ei->addr = final_end; + } + return real_updated_size; +} + +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + return __e820_update_range(&e820, start, size, old_type, new_type); +} + +#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST +static u64 __init e820_update_range_saved(u64 start, u64 size, + unsigned old_type, unsigned new_type) +{ +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return 0; + return __e820_update_range(&machine_e820, phys_to_machine(start), + size, old_type, new_type); +#else + return __e820_update_range(&e820_saved, start, size, old_type, + new_type); +#endif +} +#endif + +/* make e820 not cover the range */ +u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, + int checktype) +{ + int i; + u64 end; + u64 real_removed_size = 0; + + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + + end = start + size; + printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + if (checktype) + e820_print_type(old_type); + printk(KERN_CONT "\n"); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 final_start, final_end; + u64 ei_end; + + if (checktype && ei->type != old_type) + continue; + + ei_end = ei->addr + ei->size; + /* totally covered? */ + if (ei->addr >= start && ei_end <= end) { + real_removed_size += ei->size; + memset(ei, 0, sizeof(struct e820entry)); + continue; + } + + /* new range is totally covered? 
*/ + if (ei->addr < start && ei_end > end) { + e820_add_region(end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_removed_size += size; + continue; + } + + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(end, ei_end); + if (final_start >= final_end) + continue; + real_removed_size += final_end - final_start; + + /* + * left range could be head or tail, so need to update + * size at first. + */ + ei->size -= final_end - final_start; + if (ei->addr < final_start) + continue; + ei->addr = final_end; + } + return real_removed_size; +} + +void __init update_e820(void) +{ + u32 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + _e820_print_map(&e820, "modified"); +} +#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST +static void __init update_e820_saved(void) +{ + u32 nr_map; + + nr_map = e820_saved.nr_map; + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) + return; + e820_saved.nr_map = nr_map; +} +#endif + +#ifdef CONFIG_XEN +#define e820 machine_e820 +#endif + +#define MAX_GAP_END 0x100000000ull +/* + * Search for a gap in the e820 memory space from start_addr to end_addr. + */ +__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, + unsigned long start_addr, unsigned long long end_addr) +{ + unsigned long long last; + int i = e820.nr_map; + int found = 0; + + last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END; +#ifdef CONFIG_X86_64 + if (start_addr >= MAX_GAP_END) + last = end_addr ?: (1UL << boot_cpu_data.x86_phys_bits); +#endif + + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + if (end < start_addr) + continue; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap >= *gapsize) { + *gapsize = gap; + *gapstart = end; + found = 1; + } + } + if (start < last) + last = start; + } + return found; +} + +/* + * Search for the biggest gap in the low 32 bits of the e820 + * memory space. We pass this space to PCI to assign MMIO resources + * for hotplug or unconfigured devices in. + * Hopefully the BIOS let enough space left. + */ +__init void e820_setup_gap(void) +{ + unsigned long gapstart, gapsize; + int found; + + gapstart = 0x10000000; + gapsize = 0x400000; + found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END); + +#ifdef CONFIG_X86_64 + if (!found) { + printk(KERN_ERR + "PCI: Warning: Cannot find a gap in the 32bit address range\n" + "PCI: Unassigned devices with 32bit resource registers may break!\n"); + found = e820_search_gap(&gapstart, &gapsize, MAX_GAP_END, 0); + WARN_ON(!found); + } +#endif + + /* + * e820_reserve_resources_late protect stolen RAM already + */ + pci_mem_start = gapstart; + + printk(KERN_INFO + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); +} + +#undef e820 + +#ifndef CONFIG_XEN +/** + * Because of the size limitation of struct boot_params, only first + * 128 E820 memory entries are passed to kernel via + * boot_params.e820_map, others are passed via SETUP_E820_EXT node of + * linked list of struct setup_data, which is parsed here. 
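e820_search_gap() above is essentially a backwards sweep that tracks the lowest region start seen so far and records the largest hole below the limit. A simplified standalone version (made-up map data, assuming a sorted, non-overlapping table):

/* Simplified, standalone version of the gap search above: given a
 * sorted, non-overlapping set of regions, find the largest hole below
 * a 4 GiB limit.  Types and sample data are made up for illustration. */
#include <stdio.h>

struct region { unsigned long long start, size; };

static int find_gap(const struct region *map, int n,
                    unsigned long long limit,
                    unsigned long long *gap_start,
                    unsigned long long *gap_size)
{
        unsigned long long last = limit;
        int i, found = 0;

        for (i = n - 1; i >= 0; i--) {
                unsigned long long end = map[i].start + map[i].size;

                if (last > end && last - end > *gap_size) {
                        *gap_size = last - end;
                        *gap_start = end;
                        found = 1;
                }
                if (map[i].start < last)
                        last = map[i].start;
        }
        return found;
}

int main(void)
{
        const struct region map[] = {
                { 0x00000000ULL, 0xa0000ULL },          /* low RAM */
                { 0x00100000ULL, 0x7ff00000ULL },       /* RAM up to 2 GiB */
                { 0xfec00000ULL, 0x1400000ULL },        /* chipset/APIC space */
        };
        unsigned long long start = 0, size = 0x400000;  /* 4 MiB minimum, as above */

        if (find_gap(map, 3, 0x100000000ULL, &start, &size))
                printf("gap at %#llx, size %#llx\n", start, size);
        return 0;
}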
+ */ +void __init parse_e820_ext(struct setup_data *sdata) +{ + int entries; + struct e820entry *extmap; + + entries = sdata->len / sizeof(struct e820entry); + extmap = (struct e820entry *)(sdata->data); + __append_e820_map(extmap, entries); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + printk(KERN_INFO "extended physical RAM map:\n"); + _e820_print_map(&e820, "extended"); +} + +#if defined(CONFIG_X86_64) || \ + (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) +/** + * Find the ranges of physical addresses that do not correspond to + * e820 RAM areas and mark the corresponding pages as nosave for + * hibernation (32 bit) or software suspend and suspend to RAM (64 bit). + * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(unsigned long limit_pfn) +{ + int i; + unsigned long pfn; + + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (pfn < PFN_UP(ei->addr)) + register_nosave_region(pfn, PFN_UP(ei->addr)); + + pfn = PFN_DOWN(ei->addr + ei->size); + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) + register_nosave_region(PFN_UP(ei->addr), pfn); + + if (pfn >= limit_pfn) + break; + } +} +#endif + +#ifdef CONFIG_ACPI +/** + * Mark ACPI NVS memory region, so that we can save/restore it during + * hibernation and the subsequent resume. + */ +static int __init e820_mark_nvs_memory(void) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->type == E820_NVS) + acpi_nvs_register(ei->addr, ei->size); + } + + return 0; +} +core_initcall(e820_mark_nvs_memory); +#endif +#endif + +#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST +/* + * pre allocated 4k and reserved it in memblock and e820_saved + */ +u64 __init early_reserve_e820(u64 size, u64 align) +{ + u64 addr; +#ifdef CONFIG_XEN + unsigned int order = get_order(size); + int rc; + unsigned long max_initmap_pfn; + + if (!is_initial_xendomain()) + return 0; + size = PAGE_SIZE << order; + if (align < PAGE_SIZE) + align = PAGE_SIZE; +#endif + addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); + if (addr) { + e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); + printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); + update_e820_saved(); + } +#ifdef CONFIG_XEN + else + return 0; + max_initmap_pfn = ALIGN(PFN_UP(__pa(xen_start_info->pt_base)) + + xen_start_info->nr_pt_frames + + 1 + (1 << (19 - PAGE_SHIFT)), + 1UL << (22 - PAGE_SHIFT)); +#ifdef CONFIG_X86_32 + if ((addr >> PAGE_SHIFT) + < max(max_initmap_pfn, max_pfn_mapped)) + rc = xen_create_contiguous_region((unsigned long)__va(addr), + order, 32); +#else + if ((addr >> PAGE_SHIFT) < max_pfn_mapped) + rc = xen_create_contiguous_region((unsigned long)__va(addr), + order, 32); + else if ((addr >> PAGE_SHIFT) < max_initmap_pfn) + rc = xen_create_contiguous_region(__START_KERNEL_map + addr, + order, 32); +#endif + else + rc = early_create_contiguous_region(addr >> PAGE_SHIFT, + order, 32); + if (rc) + return 0; +#endif + + return addr; +} +#endif + +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_PAE +# define MAX_ARCH_PFN (1ULL<<(40-PAGE_SHIFT)) +# else +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT)) +# endif +#else /* CONFIG_X86_32 */ +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT +#endif + +/* + * Find the highest page frame number we have available + */ +static unsigned long __init 
e820_end_pfn(unsigned long limit_pfn, unsigned type) +{ + int i; + unsigned long last_pfn = 0; + unsigned long max_arch_pfn = MAX_ARCH_PFN; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long start_pfn; + unsigned long end_pfn; + + if (ei->type != type) + continue; + + start_pfn = ei->addr >> PAGE_SHIFT; + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; + + if (start_pfn >= limit_pfn) + continue; + if (end_pfn > limit_pfn) { + last_pfn = limit_pfn; + break; + } + if (end_pfn > last_pfn) + last_pfn = end_pfn; + } + + if (last_pfn > max_arch_pfn) + last_pfn = max_arch_pfn; + + printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", + last_pfn, max_arch_pfn); + return last_pfn; +} +unsigned long __init e820_end_of_ram_pfn(void) +{ + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); +} + +unsigned long __init e820_end_of_low_ram_pfn(void) +{ + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); +} + +static void early_panic(char *msg) +{ + early_printk(msg); + panic(msg); +} + +static int userdef __initdata; + +/* "mem=nopentium" disables the 4MB page tables. */ +static int __init parse_memopt(char *p) +{ + u64 mem_size, current_end; + unsigned int i; + + if (!p) + return -EINVAL; + +#ifndef CONFIG_XEN + if (!strcmp(p, "nopentium")) { +#ifdef CONFIG_X86_32 + setup_clear_cpu_cap(X86_FEATURE_PSE); + return 0; +#else + printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n"); + return -EINVAL; +#endif + } +#endif + + userdef = 1; + mem_size = memparse(p, &p); + /* don't remove all of memory when handling "mem={invalid}" param */ + if (mem_size == 0) + return -EINVAL; +#ifdef CONFIG_XEN + /* + * A little less than 2% of available memory are needed for page + * tables, p2m map, and mem_map. Hence the maximum amount of memory + * we can potentially balloon up to can in no case exceed about 50 + * times of what we've been given initially. Since even with that we + * won't be able to boot (due to various calculations done based on + * the total number of pages) we further restrict this to factor 32. + */ + if ((mem_size >> (PAGE_SHIFT + 5)) > xen_start_info->nr_pages) { + u64 size = (u64)xen_start_info->nr_pages << 5; + + pr_warn("mem=%Luk is invalid for an initial" + " allocation of %luk, using %Luk\n", + (unsigned long long)mem_size >> 10, + xen_start_info->nr_pages << (PAGE_SHIFT - 10), + (unsigned long long)size << (PAGE_SHIFT - 10)); + mem_size = size << PAGE_SHIFT; + } +#endif + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + + i = e820.nr_map - 1; + current_end = e820.map[i].addr + e820.map[i].size; + if (current_end < mem_size) { + /* + * The e820 map ends before our requested size so + * extend the final entry to the requested address. + */ + if (e820.map[i].type == E820_RAM) + e820.map[i].size = mem_size - e820.map[i].addr; + else + e820_add_region(current_end, mem_size - current_end, E820_RAM); + } + + return 0; +} +early_param("mem", parse_memopt); + +#ifndef CONFIG_XEN +static int __init parse_memmap_opt(char *p) +{ + char *oldp; + u64 start_at, mem_size; + + if (!p) + return -EINVAL; + + if (!strncmp(p, "exactmap", 8)) { +#ifdef CONFIG_CRASH_DUMP + /* + * If we are doing a crash dump, we still need to know + * the real mem size before original memory map is + * reset. 
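The Xen branch of parse_memopt() above caps "mem=" at 32 times the domain's initial allocation. The arithmetic can be checked with a standalone sketch (PAGE_SHIFT and the sample allocation are stand-ins for illustration):

/* Standalone illustration of the factor-32 clamp applied to "mem="
 * in the Xen branch of parse_memopt() above. */
#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned long long clamp_mem(unsigned long long mem_size_bytes,
                                    unsigned long initial_pages)
{
        /* requested size, expressed in units of 32 pages */
        if ((mem_size_bytes >> (PAGE_SHIFT + 5)) > initial_pages)
                mem_size_bytes = (unsigned long long)initial_pages << (PAGE_SHIFT + 5);
        return mem_size_bytes;
}

int main(void)
{
        /* domain started with 512 MiB (131072 pages), "mem=64G" requested */
        unsigned long long req = 64ULL << 30;
        unsigned long long lim = clamp_mem(req, 131072);

        printf("mem= clamped from %lluM to %lluM\n", req >> 20, lim >> 20);
        return 0;
}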
+ */ + saved_max_pfn = e820_end_of_ram_pfn(); +#endif + e820.nr_map = 0; + userdef = 1; + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + + userdef = 1; + if (*p == '@') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_RAM); + } else if (*p == '#') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_ACPI); + } else if (*p == '$') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_RESERVED); + } else + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + + return *p == '\0' ? 0 : -EINVAL; +} +early_param("memmap", parse_memmap_opt); +#endif + +void __init finish_e820_parsing(void) +{ + if (userdef) { + u32 nr = e820.nr_map; + + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) + early_panic("Invalid user supplied memory map"); + e820.nr_map = nr; + + printk(KERN_INFO "user-defined physical RAM map:\n"); + _e820_print_map(&e820, "user"); + } +} + +static inline const char *e820_type_to_string(int e820_type) +{ + switch (e820_type) { + case E820_RESERVED_KERN: + case E820_RAM: return "System RAM"; + case E820_ACPI: return "ACPI Tables"; + case E820_NVS: return "ACPI Non-volatile Storage"; + case E820_UNUSABLE: return "Unusable memory"; + default: return "reserved"; + } +} + +#ifdef CONFIG_XEN +#define e820 machine_e820 +#endif + +/* + * Mark e820 reserved areas as busy for the resource manager. + */ +static struct resource __initdata *e820_res; +void __init e820_reserve_resources(void) +{ + int i; + struct resource *res; + u64 end; + + res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); + e820_res = res; + for (i = 0; i < e820.nr_map; i++) { + end = e820.map[i].addr + e820.map[i].size - 1; + if (end != (resource_size_t)end) { + res++; + continue; + } + res->name = e820_type_to_string(e820.map[i].type); + res->start = e820.map[i].addr; + res->end = end; + + res->flags = IORESOURCE_MEM; + + /* + * don't register the region that could be conflicted with + * pci device BAR resource and insert them later in + * pcibios_resource_survey() + */ + if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) { + if (e820.map[i].type != E820_NVS) + res->flags |= IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + } + res++; + } + + for (i = 0; i < e820_saved.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + firmware_map_add_early(entry->addr, + entry->addr + entry->size - 1, + e820_type_to_string(entry->type)); + } +} + +/* How much should we pad RAM ending depending on where it is? 
*/ +static unsigned long ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 64MB for anything above that */ + return 64*1024*1024; +} + +#define MAX_RESOURCE_SIZE ((resource_size_t)-1) + +void __init e820_reserve_resources_late(void) +{ + int i; + struct resource *res; + + res = e820_res; + for (i = 0; i < e820.nr_map; i++) { + if (!res->parent && res->end) + insert_resource_expand_to_fit(&iomem_resource, res); + res++; + } + + /* + * Try to bump up RAM regions to reasonable boundaries to + * avoid stolen RAM: + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *entry = &e820.map[i]; + u64 start, end; + + if (entry->type != E820_RAM) + continue; + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)) - 1; + if (end > MAX_RESOURCE_SIZE) + end = MAX_RESOURCE_SIZE; + if (start >= end) + continue; + printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ", + start, end); + reserve_region_with_split(&iomem_resource, start, end, + "RAM buffer"); + } +} + +#undef e820 + +char *__init default_machine_specific_memory_setup(void) +{ + int rc, nr_map; + unsigned long maxmem; + struct xen_memory_map memmap; + static struct e820entry __initdata map[E820MAX]; + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); + + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc == -ENOSYS) { + memmap.nr_entries = 1; + map[0].addr = 0ULL; + map[0].size = PFN_PHYS((unsigned long long)xen_start_info->nr_pages); + /* 8MB slack (to balance backend allocations). */ + map[0].size += 8ULL << 20; + map[0].type = E820_RAM; + rc = 0; + } + BUG_ON(rc); + + nr_map = memmap.nr_entries; + sanitize_e820_map(map, ARRAY_SIZE(map), &nr_map); + + if (append_e820_map(map, nr_map) < 0) + BUG(); + +#ifdef CONFIG_XEN + /* See the comment in parse_memopt(). 
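For reference, the padding rule implemented by ram_alignment() above can be exercised standalone; the sample positions below are made up to show the three thresholds:

/* Standalone copy of the padding rule above: 64 KiB below 1 MiB,
 * 1 MiB below 16 MiB, 64 MiB for everything higher. */
#include <stdio.h>

static unsigned long ram_align(unsigned long long pos)
{
        unsigned long long mb = pos >> 20;

        if (!mb)
                return 64 * 1024;               /* below 1 MiB: 64 KiB */
        if (mb < 16)
                return 1024 * 1024;             /* below 16 MiB: 1 MiB */
        return 64 * 1024 * 1024;                /* above that: 64 MiB */
}

int main(void)
{
        unsigned long long pos[] = { 0x9fc00ULL, 0xa00000ULL, 0x7ff00000ULL };
        int i;

        for (i = 0; i < 3; i++)
                printf("end %#llx -> pad to %lu-byte boundary\n",
                       pos[i], ram_align(pos[i]));
        return 0;
}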
*/ + for (maxmem = rc = 0; rc < e820.nr_map; ++rc) + if (e820.map[rc].type == E820_RAM) + maxmem += e820.map[rc].size >> PAGE_SHIFT; + if (is_initial_xendomain()) { + domid_t domid = DOMID_SELF; + + rc = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); + if (rc > 0 && maxmem > rc) + maxmem = rc; + } + if ((maxmem >> 5) > xen_start_info->nr_pages) { + unsigned long long size = (u64)xen_start_info->nr_pages << 5; + + pr_warn("maxmem of %luM is invalid for an initial" + " allocation of %luM, using %LuM\n", + maxmem >> (20 - PAGE_SHIFT), + xen_start_info->nr_pages >> (20 - PAGE_SHIFT), + size >> (20 - PAGE_SHIFT)); + size <<= PAGE_SHIFT; + e820_remove_range(size, ULLONG_MAX - size, E820_RAM, 1); + } + + if (is_initial_xendomain()) { + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, machine_e820.map); + + if (HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap)) + BUG(); + machine_e820.nr_map = memmap.nr_entries; + } +#endif + + return "Xen"; +} + +void __init setup_memory_map(void) +{ + char *who; + + who = x86_init.resources.memory_setup(); +#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST +#ifdef CONFIG_XEN + if (is_initial_xendomain()) { + printk(KERN_INFO "Xen-provided machine memory map:\n"); + _e820_print_map(&machine_e820, "BIOS"); + } else +#endif + memcpy(&e820_saved, &e820, sizeof(struct e820map)); +#endif + printk(KERN_INFO "Xen-provided physical RAM map:\n"); + _e820_print_map(&e820, who); +} + +void __init memblock_x86_fill(void) +{ + int i; + u64 end; + + /* + * EFI may have more than 128 entries + * We are safe to enable resizing, beause memblock_x86_fill() + * is rather later for x86 + */ + memblock_allow_resize(); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + end = ei->addr + ei->size; + if (end != (resource_size_t)end) + continue; + + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) + continue; + + memblock_add(ei->addr, ei->size); + } + +#ifdef CONFIG_XEN + if (max_pfn > xen_start_info->nr_pages) + memblock_reserve(PFN_PHYS(xen_start_info->nr_pages), + PFN_PHYS(max_pfn - xen_start_info->nr_pages)); +#endif + + memblock_dump_all(); +} + +void __init memblock_find_dma_reserve(void) +{ +#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) + u64 nr_pages = 0, nr_free_pages = 0; + unsigned long start_pfn, end_pfn; + phys_addr_t start, end; + int i; + u64 u; + + /* + * need to find out used area below MAX_DMA_PFN + * need to use memblock to get free size in [0, MAX_DMA_PFN] + * at first, and assume boot_mem will not take below MAX_DMA_PFN + */ + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); + end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); + nr_pages += end_pfn - start_pfn; + } + + for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { + start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); + end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); + if (start_pfn < end_pfn) + nr_free_pages += end_pfn - start_pfn; + } + + set_dma_reserve(nr_pages - nr_free_pages); +#endif +} diff --git a/arch/x86/kernel/early_printk-xen.c b/arch/x86/kernel/early_printk-xen.c new file mode 100644 index 0000000..ea02752 --- /dev/null +++ b/arch/x86/kernel/early_printk-xen.c @@ -0,0 +1,291 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_XEN +/* Simple VGA output */ +#define 
VGABASE (__ISA_IO_base + 0xb8000) + +static int max_ypos = 25, max_xpos = 80; +static int current_ypos = 25, current_xpos; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= max_ypos) { + /* scroll 1 line up */ + for (k = 1, j = 0; k < max_ypos; k++, j++) { + for (i = 0; i < max_xpos; i++) { + writew(readw(VGABASE+2*(max_xpos*k+i)), + VGABASE + 2*(max_xpos*j + i)); + } + } + for (i = 0; i < max_xpos; i++) + writew(0x720, VGABASE + 2*(max_xpos*j + i)); + current_ypos = max_ypos-1; + } +#ifdef CONFIG_KGDB_KDB + if (c == '\b') { + if (current_xpos > 0) + current_xpos--; + } else if (c == '\r') { + current_xpos = 0; + } else +#endif + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(max_xpos*current_ypos + + current_xpos++)); + if (current_xpos >= max_xpos) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +static int early_serial_base = 0x3f8; /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + outb(ch, early_serial_base + TXR); + return timeout ? 
0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + if (*s == '\n') + early_serial_putc('\r'); + early_serial_putc(*s); + s++; + } +} + +#define DEFAULT_BAUD 9600 + +static __init void early_serial_init(char *s) +{ + unsigned char c; + unsigned divisor; + unsigned baud = DEFAULT_BAUD; + char *e; + + if (*s == ',') + ++s; + + if (*s) { + unsigned port; + if (!strncmp(s, "0x", 2)) { + early_serial_base = simple_strtoul(s, &e, 16); + } else { + static const int __initconst bases[] = { 0x3f8, 0x2f8 }; + + if (!strncmp(s, "ttyS", 4)) + s += 4; + port = simple_strtoul(s, &e, 10); + if (port > 1 || s == e) + port = 0; + early_serial_base = bases[port]; + } + s += strcspn(s, ","); + if (*s == ',') + s++; + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + if (*s) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +#else /* CONFIG_XEN */ + +static void +early_serial_write(struct console *con, const char *s, unsigned count) +{ + int n; + + while (count > 0) { + n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s); + if (n <= 0) + break; + count -= n; + s += n; + } +} + +static __init void early_serial_init(char *s) +{ +} + +/* + * No early VGA console on Xen, as we do not have convenient ISA-space + * mappings. Someone should fix this for domain 0. For now, use fake serial. + */ +#define early_vga_console early_serial_console + +#endif + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Direct interface for emergencies */ +static struct console *early_console = &early_vga_console; +static int __initdata early_console_initialized; + +asmlinkage void early_printk(const char *fmt, ...) 
+{ + char buf[512]; + int n; + va_list ap; + + va_start(ap, fmt); + n = vscnprintf(buf, sizeof(buf), fmt, ap); + early_console->write(early_console, buf, n); + va_end(ap); +} + +static inline void early_console_register(struct console *con, int keep_early) +{ + if (early_console->index != -1) { + printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", + con->name); + return; + } + early_console = con; + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} + +static int __init setup_early_printk(char *buf) +{ + int keep; + + if (!buf) + return 0; + + if (early_console_initialized) + return 0; + early_console_initialized = 1; + + keep = (strstr(buf, "keep") != NULL); + + while (*buf != '\0') { + if (!strncmp(buf, "serial", 6)) { + buf += 6; + early_serial_init(buf); + early_console_register(&early_serial_console, keep); + if (!strncmp(buf, ",ttyS", 5)) + buf += 5; + } + if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + } +#ifndef CONFIG_XEN + if (!strncmp(buf, "vga", 3) && + boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; +#else + if (!strncmp(buf, "vga", 3) || !strncmp(buf, "xen", 3)) { +#endif + early_console_register(&early_vga_console, keep); + } +#ifdef CONFIG_EARLY_PRINTK_DBGP + if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) + early_console_register(&early_dbgp_console, keep); +#endif +#ifdef CONFIG_HVC_XEN + if (!strncmp(buf, "xen", 3)) + early_console_register(&xenboot_console, keep); +#endif +#ifdef CONFIG_EARLY_PRINTK_INTEL_MID + if (!strncmp(buf, "mrst", 4)) { + mrst_early_console_init(); + early_console_register(&early_mrst_console, keep); + } + + if (!strncmp(buf, "hsu", 3)) { + hsu_early_console_init(buf + 3); + early_console_register(&early_hsu_console, keep); + } +#endif + buf++; + } + return 0; +} + +early_param("earlyprintk", setup_early_printk); diff --git a/arch/x86/kernel/entry_32-xen.S b/arch/x86/kernel/entry_32-xen.S new file mode 100644 index 0000000..768399e --- /dev/null +++ b/arch/x86/kernel/entry_32-xen.S @@ -0,0 +1,1722 @@ +/* + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'syscall_exit': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. 
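The saved-register layout listed in the comment above corresponds, field for field, to the 32-bit pt_regs ordering. A standalone mirror with fixed-width fields makes the offsets checkable on any host (illustration only, not the kernel's definition):

/* Standalone mirror of the stack layout documented above.  The gs slot
 * is only populated when CONFIG_X86_32_LAZY_GS is not set, as noted. */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct pt_regs_32 {
        uint32_t ebx, ecx, edx, esi, edi, ebp, eax;
        uint32_t ds, es, fs, gs;
        uint32_t orig_eax;
        uint32_t eip, cs, eflags, oldesp, oldss;
};

int main(void)
{
        printf("orig_eax at 0x%zx, eip at 0x%zx, oldss at 0x%zx\n",
               offsetof(struct pt_regs_32, orig_eax),   /* 0x2c */
               offsetof(struct pt_regs_32, eip),        /* 0x30 */
               offsetof(struct pt_regs_32, oldss));     /* 0x40 */
        return 0;
}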
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + + .section .entry.text, "ax" + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + +/* Pseudo-eflags. */ +NMI_MASK = 0x80000000 + +#ifdef CONFIG_PREEMPT +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +#define preempt_stop(clobbers) +#define resume_kernel restore_all +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +#ifdef CONFIG_VM86 +#define resume_userspace_sig check_userspace +#else +#define resume_userspace_sig resume_userspace +#endif + +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. 
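The constraint mentioned above, namely that gcc's stack protector reads the canary from the fixed address %gs:20 on 32-bit, can be illustrated with a standalone struct; the layout below is a stand-in, not the kernel's per-CPU definition:

/* Standalone illustration: whatever %gs points at must keep a 4-byte
 * canary slot at offset 20, because that is where -fstack-protector
 * generated code looks for it on 32-bit. */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct gs_area {
        char other[20];         /* whatever else lives below the canary */
        uint32_t canary;        /* must land at offset 20 (%gs:20) */
};

int main(void)
{
        printf("canary offset = %zu\n", offsetof(struct gs_area, canary));
        return 0;
}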
+ */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl_cfi $0 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp + CFI_ADJUST_CFA_OFFSET -(4 + \pop) +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl_cfi %gs + /*CFI_REL_OFFSET gs, 0*/ +.endm + +.macro POP_GS pop=0 +98: popl_cfi %gs + /*CFI_RESTORE gs*/ + .if \pop <> 0 + add $\pop, %esp + CFI_ADJUST_CFA_OFFSET -\pop + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.section __ex_table, "a" + .align 4 + .long 98b, 99b +.popsection +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.section __ex_table, "a" + .align 4 + .long 98b, 99b +.popsection +.endm + +.macro GS_TO_REG reg + movl %gs, \reg + /*CFI_REGISTER gs, \reg*/ +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) + /*CFI_REL_OFFSET gs, PT_GS*/ +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL + cld + PUSH_GS + pushl_cfi %fs + /*CFI_REL_OFFSET fs, 0;*/ + pushl_cfi %es + /*CFI_REL_OFFSET es, 0;*/ + pushl_cfi %ds + /*CFI_REL_OFFSET ds, 0;*/ + pushl_cfi %eax + CFI_REL_OFFSET eax, 0 + pushl_cfi %ebp + CFI_REL_OFFSET ebp, 0 + pushl_cfi %edi + CFI_REL_OFFSET edi, 0 + pushl_cfi %esi + CFI_REL_OFFSET esi, 0 + pushl_cfi %edx + CFI_REL_OFFSET edx, 0 + pushl_cfi %ecx + CFI_REL_OFFSET ecx, 0 + pushl_cfi %ebx + CFI_REL_OFFSET ebx, 0 + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs + SET_KERNEL_GS %edx +.endm + +.macro RESTORE_INT_REGS + popl_cfi %ebx + CFI_RESTORE ebx + popl_cfi %ecx + CFI_RESTORE ecx + popl_cfi %edx + CFI_RESTORE edx + popl_cfi %esi + CFI_RESTORE esi + popl_cfi %edi + CFI_RESTORE edi + popl_cfi %ebp + CFI_RESTORE ebp + popl_cfi %eax + CFI_RESTORE eax +.endm + +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl_cfi %ds + /*CFI_RESTORE ds;*/ +2: popl_cfi %es + /*CFI_RESTORE es;*/ +3: popl_cfi %fs + /*CFI_RESTORE fs;*/ + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.section __ex_table, "a" + .align 4 + .long 1b, 4b + .long 2b, 5b + .long 3b, 6b +.popsection + POP_GS_EX +.endm + +.macro RING0_INT_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 3*4 + /*CFI_OFFSET cs, -2*4;*/ + CFI_OFFSET eip, -3*4 +.endm + +.macro RING0_EC_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 4*4 + /*CFI_OFFSET cs, -2*4;*/ + CFI_OFFSET eip, -3*4 +.endm + +.macro RING0_PTREGS_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ + CFI_OFFSET eip, PT_EIP-PT_OLDESP + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ + CFI_OFFSET eax, PT_EAX-PT_OLDESP + CFI_OFFSET ebp, PT_EBP-PT_OLDESP + CFI_OFFSET edi, PT_EDI-PT_OLDESP + CFI_OFFSET esi, PT_ESI-PT_OLDESP + CFI_OFFSET edx, PT_EDX-PT_OLDESP + CFI_OFFSET ecx, PT_ECX-PT_OLDESP + CFI_OFFSET ebx, PT_EBX-PT_OLDESP +.endm + +ENTRY(ret_from_fork) + CFI_STARTPROC + pushl_cfi %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl_cfi %eax + pushl_cfi $0x0202 # Reset kernel eflags + 
popfl_cfi + jmp syscall_exit + CFI_ENDPROC +END(ret_from_fork) + +/* + * Interrupt exit functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN + RING0_PTREGS_FRAME +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: + GET_THREAD_INFO(%ebp) +check_userspace: + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? + jnz restore_all +need_resched: + movl TI_flags(%ebp), %ecx # need_resched set ? + testb $_TIF_NEED_RESCHED, %cl + jz restore_all + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +END(resume_kernel) +#endif + CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(ia32_sysenter_target) + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp + movl SYSENTER_stack_sp0(%esp),%esp +sysenter_past_esp: + /* + * Interrupts are disabled here, but we can't trace it until + * enough kernel state to call TRACE_IRQS_OFF can be called - but + * we immediately enable interrupts at that point anyway. + */ + pushl_cfi $__USER_DS + /*CFI_REL_OFFSET ss, 0*/ + pushl_cfi %ebp + CFI_REL_OFFSET esp, 0 + pushfl_cfi + orl $X86_EFLAGS_IF, (%esp) + pushl_cfi $__USER_CS + /*CFI_REL_OFFSET cs, 0*/ + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) + CFI_REL_OFFSET eip, 0 + + pushl_cfi %eax + SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) + +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
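+ *
+ * %ebp holds the user stack pointer at this point. The check against
+ * __PAGE_OFFSET-3 guarantees the 4-byte load below cannot reach kernel
+ * memory, and the __ex_table entry sends any fault that still occurs to
+ * syscall_fault.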
+ */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp + movl %ebp,PT_EBP(%esp) +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + + GET_THREAD_INFO(%ebp) + + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz sysenter_audit +sysenter_do_call: + cmpl $(NR_syscalls), %eax + jae syscall_badsys + call *sys_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx + jne sysexit_audit +sysenter_exit: +/* if something modifies registers it must also disable sysexit */ + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp,%ebp +#ifdef CONFIG_XEN_VCPU_INFO_PLACEMENT + GET_VCPU_INFO +#endif + TRACE_IRQS_ON +1: mov PT_FS(%esp), %fs + PTGS_TO_GS + ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ + movl %eax,%edx /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ + call __audit_syscall_entry + pushl_cfi %ebx + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jne syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + call __audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jne syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + + CFI_ENDPROC +.pushsection .fixup,"ax" +2: movl $0,PT_FS(%esp) + jmp 1b +.section __ex_table,"a" + .align 4 + .long 1b,2b +.popsection + PTGS_TO_GS_EX +ENDPROC(ia32_sysenter_target) + + # pv sysenter call handler stub +ENTRY(ia32pv_sysenter_target) + RING0_INT_FRAME + movl $__USER_DS,16(%esp) + movl %ebp,12(%esp) + movl $__USER_CS,4(%esp) + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* +5*4 is SS:ESP,EFLAGS,CS:EIP. +8 is esp0 setting. */ + pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
+ */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + jmp system_call + CFI_ENDPROC +ENDPROC(ia32pv_sysenter_target) + +/* + * syscall stub including irq exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + # system call handler stub +ENTRY(system_call) + RING0_INT_FRAME # can't unwind into user space anyway + pushl_cfi %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + # system call tracing in operation / emulation + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(NR_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) # store the return value +syscall_exit: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jne syscall_exit_work + +restore_all: + TRACE_IRQS_IRET +restore_all_notrace: +#ifndef CONFIG_XEN + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we + # are returning to the kernel. + # See comments in process.c:copy_thread() for details. + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + CFI_REMEMBER_STATE + je ldt_ss # returning to user-space with LDT SS +restore_nocheck: +#else +restore_nocheck: + movl PT_EFLAGS(%esp), %eax + testl $(X86_EFLAGS_VM|NMI_MASK), %eax + CFI_REMEMBER_STATE + jnz hypervisor_iret + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF + GET_VCPU_INFO + andb evtchn_upcall_mask(%esi),%al + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask + CFI_REMEMBER_STATE + jnz restore_all_enable_events # != 0 => enable event delivery +#endif + RESTORE_REGS 4 # skip orig_eax/error_code +irq_return: + INTERRUPT_RETURN +.section .fixup,"ax" +ENTRY(iret_exc) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous +.section __ex_table,"a" + .align 4 + .long irq_return,iret_exc +.previous + + CFI_RESTORE_STATE +#ifndef CONFIG_XEN +ldt_ss: + larl PT_OLDSS(%esp), %eax + jnz restore_nocheck + testl $0x00400000, %eax # returning to 32bit stack? + jnz restore_nocheck # allright, normal return + +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck +#endif + +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. 
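+ *
+ * Worked example: kernel %esp = 0xce013f40, saved user %esp = 0x1234abcd.
+ * The code below builds the new %esp 0x12343f40 (user high word, kernel
+ * low word) and programs the espfix segment base with the difference
+ * 0xce013f40 - 0x12343f40 = 0xbbcd0000, so base + new %esp still points
+ * at the kernel stack while the architectural %esp already carries the
+ * high word that iret would otherwise fail to restore.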
+ */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl_cfi $__ESPFIX_SS + pushl_cfi %eax /* new kernel esp */ + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ + DISABLE_INTERRUPTS(CLBR_EAX) + lss (%esp), %esp /* switch to espfix segment */ + CFI_ADJUST_CFA_OFFSET -8 + jmp restore_nocheck +#else + ALIGN +restore_all_enable_events: + TRACE_IRQS_ON + __ENABLE_INTERRUPTS +scrit: /**** START OF CRITICAL REGION ****/ + __TEST_PENDING + jnz 14f # process more events if necessary... + RESTORE_REGS 4 +1: INTERRUPT_RETURN +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous +14: __DISABLE_INTERRUPTS + TRACE_IRQS_OFF +ecrit: /**** END OF CRITICAL REGION ****/ + jmp .Ldo_upcall + + CFI_RESTORE_STATE +hypervisor_iret: + andl $~NMI_MASK, PT_EFLAGS(%esp) + RESTORE_REGS 4 + jmp hypercall_page + (__HYPERVISOR_iret * 32) +#endif + CFI_ENDPROC +ENDPROC(system_call) + + # perform work that needs to be done immediately before resumption + ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? + jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) + movl %esp, %eax + jne work_notifysig_v86 # returning to kernel-space or + # vm86-space + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig + + ALIGN +work_notifysig_v86: + pushl_cfi %ecx # save ti_flags for do_notify_resume + call save_v86_state # %eax contains pt_regs pointer + popl_cfi %ecx + movl %eax, %esp +#else + movl %esp, %eax +#endif + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig +END(work_pending) + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + movl %esp, %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. 
*/ + cmpl $(NR_syscalls), %eax + jnae syscall_call + jmp syscall_exit +END(syscall_trace_entry) + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testl $_TIF_WORK_SYSCALL_EXIT, %ecx + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call + # schedule() instead + movl %esp, %eax + call syscall_trace_leave + jmp resume_userspace +END(syscall_exit_work) + CFI_ENDPROC + + RING0_INT_FRAME # can't unwind into user space anyway +syscall_fault: + GET_THREAD_INFO(%ebp) + movl $-EFAULT,PT_EAX(%esp) + jmp resume_userspace +END(syscall_fault) + +syscall_badsys: + movl $-ENOSYS,PT_EAX(%esp) + jmp resume_userspace +END(syscall_badsys) + CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection + +/* + * System calls that need a pt_regs pointer. + */ +#define PTREGSCALL0(name) \ +ENTRY(ptregs_##name) ; \ + leal 4(%esp),%eax; \ + jmp sys_##name; \ +ENDPROC(ptregs_##name) + +#define PTREGSCALL1(name) \ +ENTRY(ptregs_##name) ; \ + leal 4(%esp),%edx; \ + movl (PT_EBX+4)(%esp),%eax; \ + jmp sys_##name; \ +ENDPROC(ptregs_##name) + +#define PTREGSCALL2(name) \ +ENTRY(ptregs_##name) ; \ + leal 4(%esp),%ecx; \ + movl (PT_ECX+4)(%esp),%edx; \ + movl (PT_EBX+4)(%esp),%eax; \ + jmp sys_##name; \ +ENDPROC(ptregs_##name) + +#define PTREGSCALL3(name) \ +ENTRY(ptregs_##name) ; \ + CFI_STARTPROC; \ + leal 4(%esp),%eax; \ + pushl_cfi %eax; \ + movl PT_EDX(%eax),%ecx; \ + movl PT_ECX(%eax),%edx; \ + movl PT_EBX(%eax),%eax; \ + call sys_##name; \ + addl $4,%esp; \ + CFI_ADJUST_CFA_OFFSET -4; \ + ret; \ + CFI_ENDPROC; \ +ENDPROC(ptregs_##name) + +PTREGSCALL1(iopl) +PTREGSCALL0(fork) +PTREGSCALL0(vfork) +PTREGSCALL3(execve) +PTREGSCALL2(sigaltstack) +PTREGSCALL0(sigreturn) +PTREGSCALL0(rt_sigreturn) +PTREGSCALL2(vm86) +PTREGSCALL1(vm86old) + +/* Clone is an oddball. The 4th arg is in %edi */ +ENTRY(ptregs_clone) + CFI_STARTPROC + leal 4(%esp),%eax + pushl_cfi %eax + pushl_cfi PT_EDI(%eax) + movl PT_EDX(%eax),%ecx + movl PT_ECX(%eax),%edx + movl PT_EBX(%eax),%eax + call sys_clone + addl $8,%esp + CFI_ADJUST_CFA_OFFSET -8 + ret + CFI_ENDPROC +ENDPROC(ptregs_clone) + +#ifndef CONFIG_XEN +.macro FIXUP_ESPFIX_STACK +/* + * Switch back for ESPFIX stack to the normal zerobased stack + * + * We can't call C functions using the ESPFIX stack. This code reads + * the high word of the segment base from the GDT and swiches to the + * normal stack and adjusts ESP with the matching offset. + */ + /* fixup the stack */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + shl $16, %eax + addl %esp, %eax /* the adjusted stack pointer */ + pushl_cfi $__KERNEL_DS + pushl_cfi %eax + lss (%esp), %esp /* switch to the normal stack segment */ + CFI_ADJUST_CFA_OFFSET -8 +.endm +.macro UNWIND_ESPFIX_STACK + movl %ss, %eax + /* see if on espfix stack */ + cmpw $__ESPFIX_SS, %ax + jne 27f + movl $__KERNEL_DS, %eax + movl %eax, %ds + movl %eax, %es + /* switch to normal stack */ + FIXUP_ESPFIX_STACK +27: +.endm + +/* + * Build the entry stubs and pointer table with some assembler magic. + * We pack 7 stubs into a single 32-byte chunk, which will fit in a + * single cache line on all modern x86 implementations. 
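+ *
+ * (Each stub is essentially a two-byte "push $imm8" -- the vector is
+ * encoded so the pushed value stays in signed byte range -- plus a
+ * two-byte short jump, so seven stubs plus the shared jump to
+ * common_interrupt fit within the 32 bytes.)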
+ */ +.section .init.rodata,"a" +ENTRY(interrupt) +.section .entry.text, "ax" + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT +ENTRY(irq_entries_start) + RING0_INT_FRAME +vector=FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector <> FIRST_EXTERNAL_VECTOR + CFI_ADJUST_CFA_OFFSET -4 + .endif +1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 + jmp 2f + .endif + .previous + .long 1b + .section .entry.text, "ax" +vector=vector+1 + .endif + .endr +2: jmp common_interrupt +.endr +END(irq_entries_start) + +.previous +END(interrupt) +.previous + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ + SAVE_ALL + TRACE_IRQS_OFF + movl %esp,%eax + call do_IRQ + jmp ret_from_intr +ENDPROC(common_interrupt) + CFI_ENDPROC + +/* + * Irq entries should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + RING0_INT_FRAME; \ + pushl_cfi $~(nr); \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ + call fn; \ + jmp ret_from_intr; \ + CFI_ENDPROC; \ +ENDPROC(name) + +#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name) + +/* The include is where all of the SMP etc. interrupts come from */ +#include + +#else +#define UNWIND_ESPFIX_STACK + + .pushsection .kprobes.text, "ax" + +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +# +# The sysexit critical region is slightly different. sysexit +# atomically removes the entire stack frame. If we interrupt in the +# critical region we know that the entire frame is present and correct +# so we can simply throw away the new one. +ENTRY(hypervisor_callback) + RING0_INT_FRAME + pushl_cfi %eax + SAVE_ALL + movl PT_CS(%esp),%ecx + movl PT_EIP(%esp),%eax + andl $SEGMENT_RPL_MASK,%ecx + cmpl $USER_RPL,%ecx + jae .Ldo_upcall + cmpl $scrit,%eax + jb 0f + cmpl $ecrit,%eax + jb critical_region_fixup +0: +#ifdef CONFIG_XEN_SUPERVISOR_MODE_KERNEL + cmpl $sysexit_scrit,%eax + jb .Ldo_upcall + cmpl $sysexit_ecrit,%eax + ja .Ldo_upcall + addl $PT_OLDESP,%esp # Remove eflags...ebx from stack frame. +#endif +.Ldo_upcall: + pushl_cfi %esp + call evtchn_do_upcall + add $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + jmp ret_from_intr + CFI_ENDPROC + +# [How we do the fixup]. We want to merge the current stack frame with the +# just-interrupted frame. How we do this depends on where in the critical +# region the interrupted handler was executing, and so how many saved +# registers are in each frame. We do this quickly using the lookup table +# 'critical_fixup_table'. 
For each byte offset in the critical region, it +# provides the number of bytes which have already been popped from the +# interrupted stack frame. +critical_region_fixup: + movsbl critical_fixup_table-scrit(%eax),%ecx # %ecx contains num slots popped + testl %ecx,%ecx + leal (%esp,%ecx,4),%esi # %esi points at end of src region + leal PT_OLDESP(%esp),%edi # %edi points at end of dst region + jle 17f # skip loop if nothing to copy +16: subl $4,%esi # pre-decrementing copy loop + subl $4,%edi + movl (%esi),%eax + movl %eax,(%edi) + loop 16b +17: movl %edi,%esp # final %edi is top of merged stack + jmp .Ldo_upcall + +.section .rodata,"a" +critical_fixup_table: + .rept __SIZEOF_TEST_PENDING + .byte -1 + .endr + .byte -1,-1 # jnz 14f + .byte 0 # pop %ebx + .byte 1 # pop %ecx + .byte 2 # pop %edx + .byte 3 # pop %esi + .byte 4 # pop %edi + .byte 5 # pop %ebp + .byte 6 # pop %eax + .byte 7 # pop %ds + .byte 8 # pop %es + .byte 9,9 # pop %fs +#ifndef CONFIG_X86_32_LAZY_GS + .byte 10,10 # pop %gs + .byte 11,11,11 # add $4,%esp +#else + .byte 10,10,10 # add $8,%esp +#endif + .byte 12 # iret + .rept __SIZEOF_DISABLE_INTERRUPTS + .byte -1 + .endr +.previous + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(failsafe_callback) + pushl %eax + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + jz 5f + addl $16,%esp # EAX != 0 => Category 2 (Bad IRET) + jmp iret_exc +5: addl $16,%esp # EAX == 0 => Category 1 (Bad segment) + RING0_INT_FRAME + pushl $0 + SAVE_ALL + jmp ret_from_exception +.section .fixup,"ax"; \ +6: xorl %eax,%eax; \ + movl %eax,4(%esp); \ + jmp 1b; \ +7: xorl %eax,%eax; \ + movl %eax,8(%esp); \ + jmp 2b; \ +8: xorl %eax,%eax; \ + movl %eax,12(%esp); \ + jmp 3b; \ +9: xorl %eax,%eax; \ + movl %eax,16(%esp); \ + jmp 4b; \ +.previous; \ +.section __ex_table,"a"; \ + .align 4; \ + .long 1b,6b; \ + .long 2b,7b; \ + .long 3b,8b; \ + .long 4b,9b; \ +.previous +#endif + CFI_ENDPROC + +ENTRY(coprocessor_error) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi $do_coprocessor_error + jmp error_code + CFI_ENDPROC +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + RING0_INT_FRAME + pushl_cfi $0 +#ifdef CONFIG_X86_INVD_BUG + /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ +661: pushl_cfi $do_general_protection +662: +.section .altinstructions,"a" + altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f +.previous +.section .altinstr_replacement,"ax" +663: pushl $do_simd_coprocessor_error +664: +.previous +#else + pushl_cfi $do_simd_coprocessor_error +#endif + jmp error_code + CFI_ENDPROC +END(simd_coprocessor_error) + +ENTRY(device_not_available) + RING0_INT_FRAME + pushl_cfi $-1 # mark this as an int + pushl_cfi $do_device_not_available + jmp error_code + CFI_ENDPROC +END(device_not_available) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) + iret +.section __ex_table,"a" + .align 4 + .long native_iret, iret_exc +.previous +END(native_iret) + 
+ENTRY(native_irq_enable_sysexit) + sti + sysexit +END(native_irq_enable_sysexit) +#endif + +ENTRY(overflow) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi $do_overflow + jmp error_code + CFI_ENDPROC +END(overflow) + +ENTRY(bounds) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi $do_bounds + jmp error_code + CFI_ENDPROC +END(bounds) + +ENTRY(invalid_op) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi $do_invalid_op + jmp error_code + CFI_ENDPROC +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi $do_coprocessor_segment_overrun + jmp error_code + CFI_ENDPROC +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + RING0_EC_FRAME + pushl_cfi $do_invalid_TSS + jmp error_code + CFI_ENDPROC +END(invalid_TSS) + +ENTRY(segment_not_present) + RING0_EC_FRAME + pushl_cfi $do_segment_not_present + jmp error_code + CFI_ENDPROC +END(segment_not_present) + +ENTRY(stack_segment) + RING0_EC_FRAME + pushl_cfi $do_stack_segment + jmp error_code + CFI_ENDPROC +END(stack_segment) + +ENTRY(alignment_check) + RING0_EC_FRAME + pushl_cfi $do_alignment_check + jmp error_code + CFI_ENDPROC +END(alignment_check) + +ENTRY(divide_error) + RING0_INT_FRAME + pushl_cfi $0 # no error code + pushl_cfi $do_divide_error + jmp error_code + CFI_ENDPROC +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi machine_check_vector + jmp error_code + CFI_ENDPROC +END(machine_check) +#endif + +#ifndef CONFIG_XEN +ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME + pushl_cfi $0 + pushl_cfi $do_spurious_interrupt_bug + jmp error_code + CFI_ENDPROC +END(spurious_interrupt_bug) +#endif /* !CONFIG_XEN */ + +ENTRY(fixup_4gb_segment) + RING0_EC_FRAME + pushl_cfi $do_fixup_4gb_segment + jmp error_code + CFI_ENDPROC +END(fixup_4gb_segment) +/* + * End of kprobes section + */ + .popsection + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movl 4(%esp), %edx + movl (%esp), %ecx + leal 4(%esp), %eax + movl %ebx, PT_EBX(%edx) + xorl %ebx, %ebx + movl %ebx, PT_ECX(%edx) + movl %ebx, PT_EDX(%edx) + movl %esi, PT_ESI(%edx) + movl %edi, PT_EDI(%edx) + movl %ebp, PT_EBP(%edx) + movl %ebx, PT_EAX(%edx) + movl $__USER_DS, PT_DS(%edx) + movl $__USER_DS, PT_ES(%edx) + movl $__KERNEL_PERCPU, PT_FS(%edx) + movl $__KERNEL_STACK_CANARY, PT_GS(%edx) + movl %eax, PT_OLDESP(%edx) + movl 16(%esp), %eax + movl %ebx, PT_ORIG_EAX(%edx) + movl %ecx, PT_EIP(%edx) + movl 12(%esp), %ecx + movl $__KERNEL_CS, PT_CS(%edx) + movl %eax, 12(%esp) + movl 8(%esp), %eax + movl %ecx, 8(%esp) + movl %ebx, PT_EFLAGS(%edx) + movl PT_EBX(%edx), %ebx + movl $__KERNEL_DS, PT_OLDSS(%edx) + jmpl *%eax + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif + +ENTRY(kernel_thread_helper) + pushl $0 # fake return address for unwinder + CFI_STARTPROC + movl %edi,%eax + call *%esi + call do_exit + ud2 # padding for call trace + CFI_ENDPROC +ENDPROC(kernel_thread_helper) + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + ret +END(mcount) + +ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* ! 
CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %edx + lea 0x4(%ebp), %eax + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %edx + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif + +#ifdef TIF_CSTAR + # pv syscall call handler stub +ENTRY(ia32pv_cstar_target) + RING0_INT_FRAME + movl $__USER_DS,16(%esp) + movl %ebp,%ecx + movl $__USER_CS,4(%esp) + movl 12(%esp),%ebp + pushl_cfi %eax # save orig_eax +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-4,%ebp + CFI_REMEMBER_STATE + ja cstar_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,cstar_fault +.previous + SAVE_ALL + GET_THREAD_INFO(%ebp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz cstar_trace_entry + cmpl $NR_syscalls,%eax + jae cstar_badsys +.Lcstar_call: + btl %eax,cstar_special + jc .Lcstar_special + call *cstar_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) # store the return value +.Lcstar_exit: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp syscall_exit +.Lcstar_special: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp syscall_call +GLOBAL(cstar_set_tif) + movl $cstar_clear_tif,(%esp) # replace return address + LOCK_PREFIX + orl $_TIF_CSTAR,TI_flags(%ebp) + jmp *sys_call_table(,%eax,4) +cstar_clear_tif: + movl %eax,PT_EAX(%esp) # store the return value + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + jmp .Lcstar_exit +cstar_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + cmpl $NR_syscalls,%eax + jae 1f + btl %eax,cstar_special + jc .Lcstar_trace_special +1: movl %esp,%eax + LOCK_PREFIX + orl $_TIF_CSTAR,TI_flags(%ebp) + call syscall_trace_enter + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + /* What it returned is what we'll actually use. */ + cmpl $NR_syscalls,%eax + jb .Lcstar_call + jmp .Lcstar_exit +.Lcstar_trace_special: + movl PT_ECX(%esp),%ecx + movl %esp,%eax + movl %ecx,PT_EBP(%esp) # put user EBP back in place + call syscall_trace_enter + /* What it returned is what we'll actually use. 
*/ + cmpl $NR_syscalls,%eax + jb syscall_call + jmp syscall_exit +cstar_badsys: + movl $-ENOSYS,PT_EAX(%esp) +.Lcstar_resume: + movl PT_ECX(%esp),%ecx + movl %ecx,PT_EBP(%esp) # put user EBP back in place + jmp resume_userspace + CFI_RESTORE_STATE +cstar_fault: + movl $-EFAULT,%eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + jmp .Lcstar_resume + CFI_ENDPROC +ENDPROC(ia32pv_cstar_target) + +ENTRY(cstar_ret_from_fork) + CFI_STARTPROC + movl PT_ECX(%esp),%ecx + GET_THREAD_INFO(%ebp) + movl %ecx,PT_EBP(%esp) # put user EBP back in place + LOCK_PREFIX + andl $~_TIF_CSTAR,TI_flags(%ebp) + jmp ret_from_fork + CFI_ENDPROC +END(cstar_ret_from_fork) + +#include +.pushsection .rodata,"a" +.balign 4 +cstar_special: +nr=0 +mask=0 +.rept NR_syscalls+31 + .irp n, __NR_sigreturn, __NR_rt_sigreturn + .if nr == \n + mask = mask | (1 << (\n & 31)) + .endif + .endr + nr = nr + 1 + .if (nr & 31) == 0 + .long mask + mask = 0 + .endif +.endr +.popsection +#endif /* TIF_CSTAR */ + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +ENTRY(page_fault) + RING0_EC_FRAME + pushl_cfi $do_page_fault + ALIGN +error_code: + /* the function address is in %gs's slot on the stack */ + pushl_cfi %fs + /*CFI_REL_OFFSET fs, 0*/ + pushl_cfi %es + /*CFI_REL_OFFSET es, 0*/ + pushl_cfi %ds + /*CFI_REL_OFFSET ds, 0*/ + pushl_cfi %eax + CFI_REL_OFFSET eax, 0 + pushl_cfi %ebp + CFI_REL_OFFSET ebp, 0 + pushl_cfi %edi + CFI_REL_OFFSET edi, 0 + pushl_cfi %esi + CFI_REL_OFFSET esi, 0 + pushl_cfi %edx + CFI_REL_OFFSET edx, 0 + pushl_cfi %ecx + CFI_REL_OFFSET ecx, 0 + pushl_cfi %ebx + CFI_REL_OFFSET ebx, 0 + cld + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC +END(page_fault) + +#ifndef CONFIG_XEN +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +.macro FIX_STACK offset ok label + cmpw $__KERNEL_CS, 4(%esp) + jne \ok +\label: + movl TSS_sysenter_sp0 + \offset(%esp), %esp + CFI_DEF_CFA esp, 0 + CFI_UNDEFINED eip + pushfl_cfi + pushl_cfi $__KERNEL_CS + pushl_cfi $sysenter_past_esp + CFI_REL_OFFSET eip, 0 +.endm +#endif /* CONFIG_XEN */ + +ENTRY(debug) + RING0_INT_FRAME +#ifndef CONFIG_XEN + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn +debug_stack_correct: +#endif /* !CONFIG_XEN */ + pushl_cfi $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. 
So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + pushl_cfi %eax +#ifndef CONFIG_XEN + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl_cfi %eax + je nmi_espfix_stack + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl_cfi %eax + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl_cfi %eax + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ + pushl_cfi %eax + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_all_notrace + CFI_ENDPROC + +nmi_stack_fixup: + RING0_INT_FRAME + FIX_STACK 12, nmi_stack_correct, 1 + jmp nmi_stack_correct + +nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK 24, nmi_stack_correct, 1 + jmp nmi_stack_correct + +nmi_espfix_stack: + /* We have a RING0_INT_FRAME here. + * + * create the pointer to lss back + */ + pushl_cfi %ss + pushl_cfi %esp + addl $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl_cfi 16(%esp) + .endr + pushl_cfi %eax + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 + jmp irq_return +#else + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + orl $NMI_MASK, PT_EFLAGS(%esp) + jmp restore_all +#endif + CFI_ENDPROC +END(nmi) + +ENTRY(int3) + RING0_INT_FRAME + pushl_cfi $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC +END(int3) + +ENTRY(general_protection) + RING0_EC_FRAME + pushl_cfi $do_general_protection + jmp error_code + CFI_ENDPROC +END(general_protection) + +#ifdef CONFIG_KVM_GUEST +ENTRY(async_page_fault) + RING0_EC_FRAME + pushl_cfi $do_async_page_fault + jmp error_code + CFI_ENDPROC +END(async_page_fault) +#endif + +/* + * End of kprobes section + */ + .popsection diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 4441d47..b0d761c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -377,7 +377,7 @@ ENTRY(ia32_sysenter_target) CFI_SIGNAL_FRAME CFI_DEF_CFA esp, 0 CFI_REGISTER esp, ebp - movl TSS_sysenter_sp0(%esp),%esp + movl SYSENTER_stack_sp0(%esp),%esp sysenter_past_esp: /* * Interrupts are disabled here, but we can't trace it until @@ -1045,7 +1045,7 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN /* Xen doesn't set %esp to be precisely what the normal sysenter entrypoint expects, so fix it up before using the normal path. 
*/ ENTRY(xen_sysenter_target) @@ -1137,7 +1137,7 @@ ENDPROC(xen_failsafe_callback) BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, xen_evtchn_do_upcall) -#endif /* CONFIG_XEN */ +#endif /* CONFIG_PARAVIRT_XEN */ #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -1300,7 +1300,7 @@ END(page_fault) * that sets up the real kernel stack. Check here, since we can't * allow the wrong stack to be used. * - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * "SYSENTER_stack_sp0+12" is because the NMI/debug handler will have * already pushed 3 words if it hits on the sysenter instruction: * eflags, cs and eip. * @@ -1312,7 +1312,7 @@ END(page_fault) cmpw $__KERNEL_CS, 4(%esp) jne \ok \label: - movl TSS_sysenter_sp0 + \offset(%esp), %esp + movl SYSENTER_stack_sp0 + \offset(%esp), %esp CFI_DEF_CFA esp, 0 CFI_UNDEFINED eip pushfl_cfi diff --git a/arch/x86/kernel/entry_64-xen.S b/arch/x86/kernel/entry_64-xen.S new file mode 100644 index 0000000..0f08b7a --- /dev/null +++ b/arch/x86/kernel/entry_64-xen.S @@ -0,0 +1,1385 @@ +/* + * linux/arch/x86_64/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * Copyright (C) 2000 Pavel Machek + * Jun Nakajima + * Asit Mallick + * Modified for Xen + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * + * Some of this is documented in Documentation/x86/entry_64.txt + * + * NOTE: This code handles signal-recognition, which happens every time + * after an interrupt and after each system call. + * + * Normal syscalls and interrupts don't save a full stack frame, this is + * only done for syscall tracing, signals or fork/exec et.al. + * + * A note on terminology: + * - top of stack: Architecture defined interrupt frame from SS to RIP + * at the top of the kernel process stack. + * - partial stack frame: partially saved registers up to R11. + * - full stack frame: Like partial stack frame, but all register saved. + * + * Some macro usage: + * - CFI macros are used to generate dwarf2 unwind information for better + * backtraces. They don't change any code. + * - SAVE_ALL/RESTORE_ALL - Save/restore all registers + * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. + * There are unfortunately lots of special cases where some registers + * not touched. The macro is a big mess that should be cleaned up. + * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. + * Gives a full stack frame. + * - ENTRY/END Define functions in the symbol table. + * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack + * frame that is otherwise undefined after a SYSCALL + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. + * - errorentry/paranoidentry/zeroentry - Define exception entry points. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. 
*/ +#include +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 + + .code64 + .section .entry.text, "ax" + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE +ENTRY(mcount) + retq +END(mcount) + +ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + MCOUNT_SAVE_FRAME + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + +GLOBAL(ftrace_call) + call ftrace_stub + + MCOUNT_RESTORE_FRAME + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +GLOBAL(ftrace_graph_call) + jmp ftrace_stub +#endif + +GLOBAL(ftrace_stub) + retq +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ +ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + + cmpq $ftrace_stub, ftrace_trace_function + jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif + +GLOBAL(ftrace_stub) + retq + +trace: + MCOUNT_SAVE_FRAME + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi + + call *ftrace_trace_function + + MCOUNT_RESTORE_FRAME + + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + MCOUNT_SAVE_FRAME + + leaq 8(%rbp), %rdi + movq 0x38(%rsp), %rsi + movq (%rbp), %rdx + subq $MCOUNT_INSN_SIZE, %rsi + + call prepare_ftrace_return + + MCOUNT_RESTORE_FRAME + + retq +END(ftrace_graph_caller) + +GLOBAL(return_to_handler) + subq $24, %rsp + + /* Save the return values */ + movq %rax, (%rsp) + movq %rdx, 8(%rsp) + movq %rbp, %rdi + + call ftrace_return_to_handler + + movq %rax, %rdi + movq 8(%rsp), %rdx + movq (%rsp), %rax + addq $24, %rsp + jmp *%rdi +#endif + + +#ifndef CONFIG_PREEMPT +#define retint_kernel retint_restore_args +#endif + + +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +#ifdef CONFIG_TRACE_IRQFLAGS + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + +NMI_MASK = 0x80000000 + +/* + * C code is not supposed to know about undefined top of stack. Every time + * a C function with an pt_regs argument is called from the SYSCALL based + * fast path FIXUP_TOP_OF_STACK is needed. + * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs + * manipulation. 
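+ *
+ * In this Xen flavour the guest kernel is entered with a complete iret
+ * frame already on the stack, so FIXUP_TOP_OF_STACK below only has to
+ * fake up CS and RCX, and RESTORE_TOP_OF_STACK has nothing to do.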
+ */ + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp offset=0 + movq $__USER_CS,CS+\offset(%rsp) + movq $-1,RCX+\offset(%rsp) + .endm + + .macro RESTORE_TOP_OF_STACK tmp offset=0 + .endm + + .macro FAKE_STACK_FRAME child_rip + /* push in order ss, rsp, eflags, cs, rip */ + xorl %eax, %eax + pushq_cfi $__KERNEL_DS /* ss */ + /*CFI_REL_OFFSET ss,0*/ + pushq_cfi %rax /* rsp */ + CFI_REL_OFFSET rsp,0 + pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */ + /*CFI_REL_OFFSET rflags,0*/ + pushq_cfi $__KERNEL_CS /* cs */ + /*CFI_REL_OFFSET cs,0*/ + pushq_cfi \child_rip /* rip */ + CFI_REL_OFFSET rip,0 + pushq_cfi %rax /* orig rax */ + .endm + + .macro UNFAKE_STACK_FRAME + addq $8*6, %rsp + CFI_ADJUST_CFA_OFFSET -(6*8) + .endm + +/* + * initial frame state for syscall + */ + .macro BASIC_FRAME start=1 offset=0 + .if \start + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp, SS+8+\offset-RIP + .else + CFI_DEF_CFA_OFFSET SS+8+\offset-RIP + .endif + /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ + CFI_REL_OFFSET rsp, RSP+\offset-RIP + /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ + /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ + CFI_REL_OFFSET rip, RIP+\offset-RIP + .endm + +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro INTR_FRAME start=1 offset=0 + .if \start == 1 + BASIC_FRAME 1, \offset+2*8 + CFI_REL_OFFSET rcx, 0+\offset + CFI_REL_OFFSET r11, 8+\offset + .else + BASIC_FRAME \start, \offset + .endif + .endm + +/* + * initial frame state for exceptions with error code (and interrupts + * with vector already pushed) + */ + .macro XCPT_FRAME start=1 offset=0 + INTR_FRAME \start, RIP+\offset-ORIG_RAX + .endm + +/* + * frame that enables calling into C. + */ + .macro PARTIAL_FRAME start=1 offset=0 + .if \start >= 0 + XCPT_FRAME 2*\start, ORIG_RAX+\offset-ARGOFFSET + .endif + CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET + CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET + CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET + CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET + CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET + CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET + CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET + CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET + CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET + .endm + +/* + * frame that enables passing a complete pt_regs to a C function. + */ + .macro DEFAULT_FRAME start=1 offset=0 + .if \start >= -1 + PARTIAL_FRAME \start, R11+\offset-R15 + .endif + CFI_REL_OFFSET rbx, RBX+\offset + CFI_REL_OFFSET rbp, RBP+\offset + CFI_REL_OFFSET r12, R12+\offset + CFI_REL_OFFSET r13, R13+\offset + CFI_REL_OFFSET r14, R14+\offset + CFI_REL_OFFSET r15, R15+\offset + .endm + + /* + * Must be consistent with the definition in arch-x86/xen-x86_64.h: + * struct iret_context { + * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss; + * }; + * with rax, r11, and rcx being taken care of in the hypercall stub. + */ + .macro HYPERVISOR_IRET flag + .if \flag == 0 # return from syscall always uses the hypercall + testb $3,1*8(%rsp) + jnz 2f + testl $NMI_MASK,2*8(%rsp) + jnz 2f + + cmpb $0,(xen_features+XENFEAT_supervisor_mode_kernel)(%rip) + jne 1f + + /* Direct iret to kernel space. Correct CS and SS. */ + orl $3,1*8(%rsp) + orl $3,4*8(%rsp) +1: iretq + .endif + +2: /* Slow iret via hypervisor. 
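+ Each hypercall stub occupies 32 bytes of the shared
+ hypercall page, hence the "__HYPERVISOR_iret * 32" offset in the jmp
+ below.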
*/ + andl $~NMI_MASK, 2*8(%rsp) + pushq $\flag & VGCF_in_syscall + jmp hypercall_page + (__HYPERVISOR_iret * 32) + .endm + +#ifndef CONFIG_XEN +/* save partial stack frame */ + .macro SAVE_ARGS_IRQ + cld + /* start from rbp in pt_regs and jump over */ + movq_cfi rdi, RDI-RBP + movq_cfi rsi, RSI-RBP + movq_cfi rdx, RDX-RBP + movq_cfi rcx, RCX-RBP + movq_cfi rax, RAX-RBP + movq_cfi r8, R8-RBP + movq_cfi r9, R9-RBP + movq_cfi r10, R10-RBP + movq_cfi r11, R11-RBP + + /* Save rbp so that we can unwind from get_irq_regs() */ + movq_cfi rbp, 0 + + /* Save previous stack value */ + movq %rsp, %rsi + + leaq -RBP(%rsp),%rdi /* arg1 for handler */ + testl $3, CS(%rdi) + je 1f + SWAPGS + /* + * irq_count is used to check if a CPU is already on an interrupt stack + * or not. While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ +1: incl PER_CPU_VAR(irq_count) + jne 2f + mov PER_CPU_VAR(irq_stack_ptr),%rsp + CFI_DEF_CFA_REGISTER rsi + +2: /* Store previous stack value */ + pushq %rsi + CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ + 0x77 /* DW_OP_breg7 */, 0, \ + 0x06 /* DW_OP_deref */, \ + 0x08 /* DW_OP_const1u */, SS+8-RBP, \ + 0x22 /* DW_OP_plus */ + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF + .endm +#endif + +ENTRY(save_rest) + CFI_STARTPROC + movq 5*8+16(%rsp), %r11 /* save return address */ + movq %rbx, RBX+16(%rsp) + movq %rbp, RBP+16(%rsp) + movq %r12, R12+16(%rsp) + movq %r13, R13+16(%rsp) + movq %r14, R14+16(%rsp) + movq %r15, R15+16(%rsp) + movq %r11, 8(%rsp) /* return address */ + FIXUP_TOP_OF_STACK %r11, 16 + ret + CFI_ENDPROC +END(save_rest) + +#ifndef CONFIG_XEN +/* save complete stack frame */ + .pushsection .kprobes.text, "ax" +ENTRY(save_paranoid) + XCPT_FRAME offset=ORIG_RAX-R15+8 + cld + movq %rdi, RDI+8(%rsp) + movq %rsi, RSI+8(%rsp) + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq %r8, R8+8(%rsp) + movq %r9, R9+8(%rsp) + movq %r10, R10+8(%rsp) + movq %r11, R11+8(%rsp) + movq_cfi rbx, RBX+8 + movq %rbp, RBP+8(%rsp) + movq %r12, R12+8(%rsp) + movq %r13, R13+8(%rsp) + movq %r14, R14+8(%rsp) + movq %r15, R15+8(%rsp) + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret + CFI_ENDPROC +END(save_paranoid) + .popsection +#endif + +/* + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ +ENTRY(ret_from_fork) + DEFAULT_FRAME + + LOCK ; btr $TIF_FORK,TI_flags(%r8) + + pushq_cfi kernel_eflags(%rip) + popfq_cfi # reset kernel eflags + + call schedule_tail # rdi: 'prev' task parameter + + GET_THREAD_INFO(%rcx) + + RESTORE_REST + + testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? + jnz 1f + /* Need to set the proper %ss (not NULL) for ring 3 iretq */ + movl $__KERNEL_DS,SS-ARGOFFSET(%rsp) + jmp retint_restore_args +1: + testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET + jnz int_ret_from_sys_call + + RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET + jmp ret_from_sys_call # go to the SYSRET fastpath + + CFI_ENDPROC +END(ret_from_fork) + +/* + * System call entry. Up to 6 arguments in registers are supported. + * + * SYSCALL does not save anything on the stack and does not change the + * stack pointer. 
+ */ + +/* + * Register setup: + * rax system call number + * rdi arg0 + * rcx return address for syscall/sysret, C arg3 + * rsi arg1 + * rdx arg2 + * r10 arg3 (--> moved to rcx for C) + * r8 arg4 + * r9 arg5 + * r11 eflags for syscall/sysret, temporary for C + * r12-r15,rbp,rbx saved by C code, not touched. + * + * Interrupts are enabled on entry. + * Only called from user space. + * + * XXX if we had a free scratch register we could save the RSP into the stack frame + * and report it properly in ps. Unfortunately we haven't. + * + * When user can change the frames always force IRET. That is because + * it deals with uncanonical addresses better. SYSRET has trouble + * with them due to bugs in both AMD and Intel CPUs. + */ + +ENTRY(system_call) + INTR_FRAME start=2 offset=2*8 + SAVE_ARGS -8,0 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jnz tracesys +system_call_fastpath: + cmpq $__NR_syscall_max,%rax + ja badsys + movq %r10,%rcx + call *sys_call_table(,%rax,8) # XXX: rip relative + movq %rax,RAX-ARGOFFSET(%rsp) +/* + * Syscall return path ending with SYSRET (fast path) + * Has incomplete stack frame and undefined top of stack. + */ +ret_from_sys_call: + movl $_TIF_ALLWORK_MASK,%edi + /* edi: flagmask */ +sysret_check: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx + andl %edi,%edx + jnz sysret_careful + CFI_REMEMBER_STATE + /* + * sysretq will re-enable interrupts: + */ + TRACE_IRQS_ON + RESTORE_ARGS 1,8,0,0 + xor %ecx,%ecx + xor %r11,%r11 + HYPERVISOR_IRET VGCF_IN_SYSCALL + + CFI_RESTORE_STATE + /* Handle reschedules */ + /* edx: work, edi: workmask */ +sysret_careful: + bt $TIF_NEED_RESCHED,%edx + jnc sysret_signal + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq_cfi %rdi + call schedule + popq_cfi %rdi + jmp sysret_check + + /* Handle a signal */ +sysret_signal: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) +#ifdef CONFIG_AUDITSYSCALL + bt $TIF_SYSCALL_AUDIT,%edx + jc sysret_audit +#endif + /* + * We have a signal, or exit tracing or single-step. + * These all wind up with the iret return path anyway, + * so just join that path right now. + */ + FIXUP_TOP_OF_STACK %r11, -ARGOFFSET + jmp int_check_syscall_exit_work + +badsys: + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) + jmp ret_from_sys_call + +#ifdef CONFIG_AUDITSYSCALL + /* + * Fast path for syscall audit without full syscall trace. + * We just call __audit_syscall_entry() directly, and then + * jump back to the normal fast path. + */ +auditsys: + movq %r10,%r9 /* 6th arg: 4th syscall arg */ + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */ + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */ + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ + movq %rax,%rsi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ + call __audit_syscall_entry + LOAD_ARGS 0 /* reload call-clobbered registers */ + jmp system_call_fastpath + + /* + * Return fast path for syscall audit. Call __audit_syscall_exit() + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT + * masked off. + */ +sysret_audit: + movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? 
*/ + setbe %al /* 1 if so, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + jmp sysret_check +#endif /* CONFIG_AUDITSYSCALL */ + + /* Do syscall tracing */ +tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jz auditsys +#endif + SAVE_REST + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + call syscall_trace_enter + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %rax because syscall_trace_enter() returned + * the value it wants us to use in the table lookup. + */ + LOAD_ARGS ARGOFFSET, 1 + RESTORE_REST + cmpq $__NR_syscall_max,%rax + ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ + movq %r10,%rcx /* fixup for C */ + call *sys_call_table(,%rax,8) + movq %rax,RAX-ARGOFFSET(%rsp) + /* Use IRET because user could have changed frame */ + +/* + * Syscall return path ending with IRET. + * Has correct top of stack, but partial stack frame. + */ +GLOBAL(int_ret_from_sys_call) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + movl $_TIF_ALLWORK_MASK,%edi + /* edi: mask to check */ +GLOBAL(int_with_check) + LOCKDEP_SYS_EXIT_IRQ + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%edx + andl %edi,%edx + jnz int_careful + andl $~TS_COMPAT,TI_status(%rcx) + jmp retint_restore_args + + /* Either reschedule or signal or syscall exit tracking needed. */ + /* First do a reschedule test. */ + /* edx: work, edi: workmask */ +int_careful: + bt $TIF_NEED_RESCHED,%edx + jnc int_very_careful + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq_cfi %rdi + call schedule + popq_cfi %rdi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + + /* handle signals and tracing -- both require a full stack frame */ +int_very_careful: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) +int_check_syscall_exit_work: + SAVE_REST + /* Check for syscall exit trace */ + testl $_TIF_WORK_SYSCALL_EXIT,%edx + jz int_signal + pushq_cfi %rdi + leaq 8(%rsp),%rdi # &ptregs -> arg1 + call syscall_trace_leave + popq_cfi %rdi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi + jmp int_restore_rest + +int_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz 1f + movq %rsp,%rdi # &ptregs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call do_notify_resume +1: movl $_TIF_WORK_MASK,%edi +int_restore_rest: + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + CFI_ENDPROC +END(system_call) + +/* + * Certain special system calls that need to save a complete full stack frame. 
+ */ + .macro PTREGSCALL label,func,arg +ENTRY(\label) + PARTIAL_FRAME 1 8 /* offset 8: return address */ + subq $REST_SKIP, %rsp + CFI_ADJUST_CFA_OFFSET REST_SKIP + call save_rest + DEFAULT_FRAME -2 8 /* offset 8: return address */ + leaq 8(%rsp), \arg /* pt_regs pointer */ + call \func + jmp ptregscall_common + CFI_ENDPROC +END(\label) + .endm + + PTREGSCALL stub_clone, sys_clone, %r8 + PTREGSCALL stub_fork, sys_fork, %rdi + PTREGSCALL stub_vfork, sys_vfork, %rdi + PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx + PTREGSCALL stub_iopl, sys_iopl, %rsi + +ENTRY(ptregscall_common) + DEFAULT_FRAME 1 8 /* offset 8: return address */ + RESTORE_TOP_OF_STACK %r11, 8 + movq_cfi_restore R15+8, r15 + movq_cfi_restore R14+8, r14 + movq_cfi_restore R13+8, r13 + movq_cfi_restore R12+8, r12 + movq_cfi_restore RBP+8, rbp + movq_cfi_restore RBX+8, rbx + ret $REST_SKIP /* pop extended registers */ + CFI_ENDPROC +END(ptregscall_common) + +ENTRY(stub_execve) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + movq %rsp, %rcx + call sys_execve + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_execve) + +/* + * sigreturn is special because it needs to restore all registers on return. + * This cannot be done with SYSRET, so use the IRET return path instead. + */ +ENTRY(stub_rt_sigreturn) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + movq %rsp,%rdi + FIXUP_TOP_OF_STACK %r11 + call sys_rt_sigreturn + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_rt_sigreturn) + +/* + * Interrupt exit. + */ + +retint_with_reschedule: + PARTIAL_FRAME + movl $_TIF_WORK_MASK,%edi +retint_check: + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + andl %edi,%edx + CFI_REMEMBER_STATE + jnz retint_careful +retint_restore_args: /* return to kernel space */ + movl EFLAGS-REST_SKIP(%rsp), %eax + shr $9, %eax # EAX[0] == IRET_EFLAGS.IF + GET_VCPU_INFO + andb evtchn_upcall_mask(%rsi),%al + andb $1,%al # EAX[0] == IRET_EFLAGS.IF & event_mask + jnz restore_all_enable_events # != 0 => enable event delivery + + RESTORE_ARGS 1,8,1 + HYPERVISOR_IRET 0 + + /* edi: workmask, edx: work */ +retint_careful: + CFI_RESTORE_STATE + bt $TIF_NEED_RESCHED,%edx + jnc retint_signal + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq_cfi %rdi + call schedule + popq_cfi %rdi + GET_THREAD_INFO(%rcx) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp retint_check + +retint_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz retint_restore_args + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + SAVE_REST + movq $-1,ORIG_RAX(%rsp) + xorl %esi,%esi # oldset + movq %rsp,%rdi # &pt_regs + call do_notify_resume + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_with_reschedule + +#ifdef CONFIG_PREEMPT + /* Returning to kernel space. Check if we need preemption */ + /* rcx: threadinfo. interrupts off. */ +ENTRY(retint_kernel) + cmpl $0,TI_preempt_count(%rcx) + jnz retint_restore_args + bt $TIF_NEED_RESCHED,TI_flags(%rcx) + jnc retint_restore_args + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ + jnc retint_restore_args + call preempt_schedule_irq + jmp retint_kernel /* check again */ +#endif + + CFI_ENDPROC +END(retint_check) + +#ifndef CONFIG_XEN +/* + * APIC interrupts. 
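+ * These native vector stubs are only assembled when CONFIG_XEN is not set;
+ * in a Xen pv kernel the corresponding work arrives through the
+ * event-channel upcall (do_hypervisor_callback) instead.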
+ */ +.macro apicinterrupt num sym do_sym +ENTRY(\sym) + INTR_FRAME + pushq_cfi $~(\num) + interrupt \do_sym + jmp error_entry + CFI_ENDPROC +END(\sym) +.endm + +#ifdef CONFIG_SMP +apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt REBOOT_VECTOR \ + reboot_interrupt smp_reboot_interrupt +#endif + +#ifdef CONFIG_X86_UV +apicinterrupt UV_BAU_MESSAGE \ + uv_bau_message_intr1 uv_bau_message_interrupt +#endif +apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR \ + x86_platform_ipi smp_x86_platform_ipi + +#ifdef CONFIG_SMP + ALIGN + INTR_FRAME +.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 +.if NUM_INVALIDATE_TLB_VECTORS > \idx +ENTRY(invalidate_interrupt\idx) + pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx) + jmp .Lcommon_invalidate_interrupt0 + CFI_ADJUST_CFA_OFFSET -8 +END(invalidate_interrupt\idx) +.endif +.endr + CFI_ENDPROC +apicinterrupt INVALIDATE_TLB_VECTOR_START, \ + invalidate_interrupt0, smp_invalidate_interrupt +#endif + +apicinterrupt THRESHOLD_APIC_VECTOR \ + threshold_interrupt smp_threshold_interrupt +apicinterrupt THERMAL_APIC_VECTOR \ + thermal_interrupt smp_thermal_interrupt + +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ + call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR \ + call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR \ + reschedule_interrupt smp_reschedule_interrupt +#endif + +apicinterrupt ERROR_APIC_VECTOR \ + error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt + +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt +#endif +#endif /* !CONFIG_XEN */ + +/* + * Exception entry points. + */ +.macro zeroentry sym do_sym +ENTRY(\sym) + INTR_FRAME + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + movq $-1,8(%rsp) /* ORIG_RAX: no syscall to restart */ + subq $ORIG_RAX-R15-1*8,%rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15-1*8 + call error_entry + DEFAULT_FRAME -1 + movq %rsp,%rdi /* pt_regs pointer */ + xorl %esi,%esi /* no error code */ + call \do_sym + jmp error_exit /* %ebx: no swapgs flag */ + CFI_ENDPROC +END(\sym) +.endm + +.macro paranoidzeroentry sym do_sym + zeroentry \sym \do_sym +.endm + +.macro paranoidzeroentry_ist sym do_sym ist + zeroentry \sym \do_sym +.endm + +.macro errorentry sym do_sym +ENTRY(\sym) + XCPT_FRAME + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + subq $ORIG_RAX-R15-2*8,%rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15-2*8 + call error_entry + DEFAULT_FRAME -1 + movq %rsp,%rdi /* pt_regs pointer */ + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + call \do_sym + jmp error_exit /* %ebx: no swapgs flag */ + CFI_ENDPROC +END(\sym) +.endm + + /* error code is on the stack already */ +.macro paranoiderrorentry sym do_sym + errorentry \sym \do_sym +.endm + +/* + * Copied from arch/xen/i386/kernel/entry.S + */ +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. 
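+# ("Events" here are Xen event-channel upcalls, the paravirtual counterpart
+# of hardware interrupts; they are masked per-vcpu through
+# evtchn_upcall_mask rather than with cli/sti.)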
+# HOWEVER, we must enable events before popping the stack frame
+# (which can't be done atomically), so it would still be possible to get
+# enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+ENTRY(do_hypervisor_callback)	# do_hypervisor_callback(struct pt_regs *)
+	CFI_STARTPROC
+# Since we don't modify %rdi, evtchn_do_upcall(struct pt_regs *) will
+# see the correct pointer to the pt_regs
+	movq %rdi, %rsp		# we don't return, adjust the stack frame
+	CFI_ENDPROC
+	DEFAULT_FRAME
+11:	incl PER_CPU_VAR(irq_count)
+	movq %rsp,%rbp
+	CFI_DEF_CFA_REGISTER rbp
+	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+	pushq %rbp			# backlink for old unwinder
+	call evtchn_do_upcall
+	popq %rsp
+	CFI_DEF_CFA_REGISTER rsp
+	decl PER_CPU_VAR(irq_count)
+	jmp error_exit
+	CFI_ENDPROC
+END(do_hypervisor_callback)
+
+	ALIGN
+restore_all_enable_events:
+	PARTIAL_FRAME
+	TRACE_IRQS_ON
+	__ENABLE_INTERRUPTS
+
+scrit:	/**** START OF CRITICAL REGION ****/
+	__TEST_PENDING
+	CFI_REMEMBER_STATE
+	jnz 14f			# process more events if necessary...
+	RESTORE_ARGS 1,8,1
+	HYPERVISOR_IRET 0
+
+	CFI_RESTORE_STATE
+14:	__DISABLE_INTERRUPTS
+	SAVE_REST
+	movq %rsp,%rdi		# set the argument again
+	jmp 11b
+	CFI_ENDPROC
+ecrit:	/**** END OF CRITICAL REGION ****/
+# At this point, unlike on x86-32, we do not attempt the critical-region
+# fixup: it would complicate the code, and the stack frame is more complex
+# on x86-64. If the kernel is interrupted inside the critical section it
+# simply performs the IRET; everything is restored at that point, i.e.
+# execution resumes at the interrupted instruction with the same context.
+
+# The hypervisor uses this callback for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 needs no fixup, as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case, because if we used the IRET
+# hypercall to pop the stack frame we would end up in an infinite loop of
+# failsafe callbacks.
+# We distinguish between the categories by comparing each saved segment
+# register with its current contents: any discrepancy means we are in
+# category 1.
+ENTRY(failsafe_callback)
+	INTR_FRAME offset=4*8
+	movw %ds,%cx
+	cmpw %cx,0x10(%rsp)
+	CFI_REMEMBER_STATE
+	jne 1f
+	movw %es,%cx
+	cmpw %cx,0x18(%rsp)
+	jne 1f
+	movw %fs,%cx
+	cmpw %cx,0x20(%rsp)
+	jne 1f
+	movw %gs,%cx
+	cmpw %cx,0x28(%rsp)
+	jne 1f
+	/* All segments match their saved values => Category 2 (Bad IRET). */
+	movq (%rsp),%rcx
+	CFI_RESTORE rcx
+	movq 8(%rsp),%r11
+	CFI_RESTORE r11
+	addq $0x30,%rsp
+	CFI_ADJUST_CFA_OFFSET -0x30
+	movq $11,%rdi	/* SIGSEGV */
+	jmp do_exit
+	CFI_RESTORE_STATE
+1:	/* Segment mismatch => Category 1 (Bad segment).  Retry the IRET.
*/ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq_cfi $0 + SAVE_ALL + jmp error_exit + CFI_ENDPROC + +zeroentry divide_error do_divide_error +zeroentry overflow do_overflow +zeroentry bounds do_bounds +zeroentry invalid_op do_invalid_op +zeroentry device_not_available do_device_not_available +zeroentry hypervisor_callback do_hypervisor_callback +zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun +errorentry invalid_TSS do_invalid_TSS +errorentry segment_not_present do_segment_not_present +zeroentry coprocessor_error do_coprocessor_error +errorentry alignment_check do_alignment_check +zeroentry simd_coprocessor_error do_simd_coprocessor_error + +ENTRY(kernel_thread_helper) + pushq $0 # fake return address + CFI_STARTPROC + /* + * Here we are in the child and the registers are set as they were + * at kernel_thread() invocation in the parent. + */ + call *%rsi + # exit + mov %eax, %edi + call do_exit + ud2 # padding for call trace + CFI_ENDPROC +END(kernel_thread_helper) + +/* + * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. + * + * C extern interface: + * extern long execve(const char *name, char **argv, char **envp) + * + * asm input arguments: + * rdi: name, rsi: argv, rdx: envp + * + * We want to fallback into: + * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs) + * + * do_sys_execve asm fallback arguments: + * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack + */ +ENTRY(kernel_execve) + CFI_STARTPROC + FAKE_STACK_FRAME $0 + SAVE_ALL + movq %rsp,%rcx + call sys_execve + movq %rax, RAX(%rsp) + RESTORE_REST + testq %rax,%rax + jne 1f + jmp int_ret_from_sys_call +1: RESTORE_ARGS + UNFAKE_STACK_FRAME + ret + CFI_ENDPROC +END(kernel_execve) + +/* Call softirq on interrupt stack. Interrupts are off. 
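+ * We bump the per-cpu irq_count and, unless we are already on the IRQ
+ * stack, switch %rsp to irq_stack_ptr so that deep softirq work cannot
+ * overflow the small task stack; leaveq restores the original stack.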
*/ +ENTRY(call_softirq) + CFI_STARTPROC + pushq_cfi %rbp + CFI_REL_OFFSET rbp,0 + mov %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + incl PER_CPU_VAR(irq_count) + cmove PER_CPU_VAR(irq_stack_ptr),%rsp + push %rbp # backlink for old unwinder + call __do_softirq + leaveq + CFI_RESTORE rbp + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 + decl PER_CPU_VAR(irq_count) + ret + CFI_ENDPROC +END(call_softirq) + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movq %r15, R15(%rdi) + movq %r14, R14(%rdi) + xchgq %rsi, %rdx + movq %r13, R13(%rdi) + movq %r12, R12(%rdi) + xorl %eax, %eax + movq %rbp, RBP(%rdi) + movq %rbx, RBX(%rdi) + movq (%rsp), %r9 + xchgq %rdx, %rcx + movq %rax, R11(%rdi) + movq %rax, R10(%rdi) + movq %rax, R9(%rdi) + movq %rax, R8(%rdi) + movq %rax, RAX(%rdi) + movq %rax, RCX(%rdi) + movq %rax, RDX(%rdi) + movq %rax, RSI(%rdi) + movq %rax, RDI(%rdi) + movq %rax, ORIG_RAX(%rdi) + movq %r9, RIP(%rdi) + leaq 8(%rsp), %r9 + movq $__KERNEL_CS, CS(%rdi) + movq %rax, EFLAGS(%rdi) + movq %r9, RSP(%rdi) + movq $__KERNEL_DS, SS(%rdi) + jmpq *%rcx + CFI_ENDPROC +END(arch_unwind_init_running) +#endif + +/* + * Some functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" + +paranoidzeroentry_ist debug do_debug DEBUG_STACK +zeroentry nmi do_nmi_callback +paranoidzeroentry_ist int3 do_int3 DEBUG_STACK +paranoiderrorentry stack_segment do_stack_segment +errorentry general_protection do_general_protection +errorentry page_fault do_page_fault +#ifdef CONFIG_KVM_GUEST +errorentry async_page_fault do_async_page_fault +#endif +#ifdef CONFIG_X86_MCE +paranoidzeroentry machine_check *machine_check_vector(%rip) +#endif + +#ifndef CONFIG_XEN + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + + /* ebx: no swapgs flag */ +ENTRY(paranoid_exit) + DEFAULT_FRAME + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore + testl $3,CS(%rsp) + jnz paranoid_userspace +paranoid_swapgs: + TRACE_IRQS_IRETQ 0 + SWAPGS_UNSAFE_STACK + RESTORE_ALL 8 + jmp irq_return +paranoid_restore: + TRACE_IRQS_IRETQ 0 + RESTORE_ALL 8 + jmp irq_return +paranoid_userspace: + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule + movl %ebx,%edx /* arg3: thread flags */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp paranoid_userspace +paranoid_schedule: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + call schedule + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + jmp paranoid_userspace + CFI_ENDPROC +END(paranoid_exit) +#endif + +/* + * Exception entry point. This expects an error code/orig_rax on the stack. + * returns in "no swapgs flag" in %ebx. 
+ */ +ENTRY(error_entry) + XCPT_FRAME start=2 offset=ORIG_RAX-R15+8 + /* oldrax contains error code */ + cld + movq %rdi, RDI+8(%rsp) + movq %rsi, RSI+8(%rsp) + movq %rdx, RDX+8(%rsp) + movq %rcx, RCX+8(%rsp) + movq %rax, RAX+8(%rsp) + movq %r8, R8+8(%rsp) + movq %r9, R9+8(%rsp) + movq %r10, R10+8(%rsp) + movq %r11, R11+8(%rsp) + movq_cfi rbx, RBX+8 + movq %rbp, RBP+8(%rsp) + movq %r12, R12+8(%rsp) + movq %r13, R13+8(%rsp) + movq %r14, R14+8(%rsp) + movq %r15, R15+8(%rsp) +#ifndef CONFIG_XEN + xorl %ebx,%ebx + testl $3,CS+8(%rsp) + je error_kernelspace +error_swapgs: + SWAPGS +error_sti: +#endif + TRACE_IRQS_OFF + ret + +#ifndef CONFIG_XEN +/* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. The exception handlers after iret run with + * kernel gs again, so don't set the user space flag. B stepping K8s + * sometimes report an truncated RIP for IRET exceptions returning to + * compat mode. Check for these here too. + */ +error_kernelspace: + CFI_REL_OFFSET rcx, RCX+8 + incl %ebx + leaq irq_return(%rip),%rcx + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + movl %ecx,%eax /* zero extend */ + cmpq %rax,RIP+8(%rsp) + je bstep_iret + cmpq $gs_change,RIP+8(%rsp) + je error_swapgs + jmp error_sti + +bstep_iret: + /* Fix truncated RIP */ + movq %rcx,RIP+8(%rsp) + jmp error_swapgs +#endif + CFI_ENDPROC +END(error_entry) + + +ENTRY(error_exit) + DEFAULT_FRAME + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + testb $3,CS-ARGOFFSET(%rsp) + jz retint_kernel + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + jmp retint_restore_args + CFI_ENDPROC +END(error_exit) + + +#define extern # +#include + +.pushsection PER_CPU_BASE_SECTION, "aw", @progbits +in_NMI: .byte 0 +.popsection + +do_nmi_callback: + CFI_STARTPROC + addq $8, %rsp + CFI_ENDPROC + DEFAULT_FRAME + orb $1, PER_CPU_VAR(in_NMI) + js 1f +0: + movb $0x80, PER_CPU_VAR(in_NMI) + call do_nmi + movl $0x80, %eax + cmpxchgb %ah, PER_CPU_VAR(in_NMI) + jne 0b + orl $NMI_MASK,EFLAGS(%rsp) +1: + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_restore_args + CFI_ENDPROC +END(do_nmi_callback) + + +#ifndef CONFIG_IA32_EMULATION +ENTRY(ignore_sysret) + INTR_FRAME + popq_cfi %rcx + CFI_RESTORE rcx + popq_cfi %r11 + CFI_RESTORE r11 + mov $-ENOSYS,%eax + # any non-zero value not having VGCF_in_syscall set will do: + HYPERVISOR_IRET VGCF_i387_valid + CFI_ENDPROC +END(ignore_sysret) +#endif + +/* + * End of kprobes section + */ + .popsection diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 499ceb5..78d0610 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1256,7 +1256,7 @@ ENTRY(arch_unwind_init_running) END(arch_unwind_init_running) #endif -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN zeroentry xen_hypervisor_callback xen_do_hypervisor_callback /* @@ -1356,7 +1356,7 @@ END(xen_failsafe_callback) apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ xen_hvm_callback_vector xen_evtchn_do_upcall -#endif /* CONFIG_XEN */ +#endif /* CONFIG_PARAVIRT_XEN */ /* * Some functions should be protected against kprobes @@ -1366,7 +1366,7 @@ apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment -#ifdef CONFIG_XEN +#ifdef CONFIG_PARAVIRT_XEN zeroentry xen_debug do_debug zeroentry xen_int3 do_int3 errorentry 
xen_stack_segment do_stack_segment diff --git a/arch/x86/kernel/fixup.c b/arch/x86/kernel/fixup.c new file mode 100644 index 0000000..64cd323 --- /dev/null +++ b/arch/x86/kernel/fixup.c @@ -0,0 +1,89 @@ +/****************************************************************************** + * fixup.c + * + * Binary-rewriting of certain IA32 instructions, on notification by Xen. + * Used to avoid repeated slow emulation of common instructions used by the + * user-space TLS (Thread-Local Storage) libraries. + * + * **** NOTE **** + * Issues with the binary rewriting have caused it to be removed. Instead + * we rely on Xen's emulator to boot the kernel, and then print a banner + * message recommending that the user disables /lib/tls. + * + * Copyright (c) 2004, K A Fraser + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include + +#define DP(_f, _args...) pr_alert(" " _f "\n" , ## _args ) + +dotraplinkage void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) +{ + static unsigned long printed = 0; + char info[100]; + int i; + + /* Ignore statically-linked init. */ + if (current->tgid == 1) + return; + + VOID(HYPERVISOR_vm_assist(VMASST_CMD_disable, + VMASST_TYPE_4gb_segments_notify)); + + if (test_and_set_bit(0, &printed)) + return; + + sprintf(info, "%s (pid=%d)", current->comm, current->tgid); + + DP(""); + DP("***************************************************************"); + DP("***************************************************************"); + DP("** WARNING: Currently emulating unsupported memory accesses **"); + DP("** in /lib/tls glibc libraries. The emulation is **"); + DP("** slow. To ensure full performance you should **"); + DP("** install a 'xen-friendly' (nosegneg) version of **"); + DP("** the library, or disable tls support by executing **"); + DP("** the following as root: **"); + DP("** mv /lib/tls /lib/tls.disabled **"); + DP("** Offending process: %-38.38s **", info); + DP("***************************************************************"); + DP("***************************************************************"); + DP(""); + + for (i = 5; i > 0; i--) { + touch_softlockup_watchdog(); + printk("Pausing... 
%d", i); + mdelay(1000); + printk("\b\b\b\b\b\b\b\b\b\b\b\b"); + } + + printk("Continuing...\n\n"); +} + +static int __init fixup_init(void) +{ + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_4gb_segments_notify)); + return 0; +} +__initcall(fixup_init); diff --git a/arch/x86/kernel/head-xen.c b/arch/x86/kernel/head-xen.c new file mode 100644 index 0000000..b15fd60 --- /dev/null +++ b/arch/x86/kernel/head-xen.c @@ -0,0 +1,223 @@ +#include +#include +#include +#include + +#include +#ifndef CONFIG_XEN +#include + +#define BIOS_LOWMEM_KILOBYTES 0x413 + +/* + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. This also contains a + * workaround for Dell systems that neglect to reserve EBDA. + * The same workaround also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in. + */ +void __init reserve_ebda_region(void) +{ + unsigned int lowmem, ebda_addr; + + /* To determine the position of the EBDA and the */ + /* end of conventional memory, we need to look at */ + /* the BIOS data area. In a paravirtual environment */ + /* that area is absent. We'll just have to assume */ + /* that the paravirt case can handle memory setup */ + /* correctly, without our help. */ + if (paravirt_enabled()) + return; + + /* end of low (conventional) memory */ + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); + lowmem <<= 10; + + /* start of EBDA area */ + ebda_addr = get_bios_ebda(); + + /* Fixup: bios puts an EBDA in the top 64K segment */ + /* of conventional memory, but does not adjust lowmem. */ + if ((lowmem - ebda_addr) <= 0x10000) + lowmem = ebda_addr; + + /* Fixup: bios does not report an EBDA at all. */ + /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */ + if ((ebda_addr == 0) && (lowmem >= 0x9f000)) + lowmem = 0x9f000; + + /* Paranoia: should never happen, but... 
*/ + if ((lowmem == 0) || (lowmem >= 0x100000)) + lowmem = 0x9f000; + + /* reserve all memory between lowmem and the 1MB mark */ + memblock_reserve(lowmem, 0x100000 - lowmem); +} +#else /* CONFIG_XEN */ +#include +#include +#include +#include +#include +#include +#include + +extern void hypervisor_callback(void); +extern void failsafe_callback(void); +extern void nmi(void); + +#ifdef CONFIG_X86_64 +#include +#define CALLBACK_ADDR(fn) ((unsigned long)(fn)) +#else +#define CALLBACK_ADDR(fn) { __KERNEL_CS, (unsigned long)(fn) } +#endif + +unsigned long __initdata xen_initrd_start; + +unsigned long *__read_mostly machine_to_phys_mapping = + (void *)MACH2PHYS_VIRT_START; +EXPORT_SYMBOL(machine_to_phys_mapping); +unsigned long __read_mostly machine_to_phys_nr; +EXPORT_SYMBOL(machine_to_phys_nr); + +void __init xen_start_kernel(void) +{ + unsigned int i; + struct xen_machphys_mapping mapping; + + xen_setup_features(); + + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { + machine_to_phys_mapping = (unsigned long *)mapping.v_start; + machine_to_phys_nr = mapping.max_mfn + 1; + } else + machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; +#ifdef CONFIG_X86_32 + WARN_ON(machine_to_phys_mapping + (machine_to_phys_nr - 1) + < machine_to_phys_mapping); +#endif + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping = + (unsigned long *)xen_start_info->mfn_list; + + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_writable_pagetables)); + + memblock_reserve(ALIGN(__pa_symbol(&_end), PAGE_SIZE), + __pa(xen_start_info->pt_base) + + PFN_PHYS(xen_start_info->nr_pt_frames)); + +#ifdef CONFIG_X86_32 +{ + extern pte_t swapper_pg_fixmap[PTRS_PER_PTE]; + unsigned long addr; + + /* Do an early initialization of the fixmap area */ + make_lowmem_page_readonly(swapper_pg_fixmap, XENFEAT_writable_page_tables); + addr = __fix_to_virt(FIX_EARLYCON_MEM_BASE); + set_pmd(pmd_offset(pud_offset(swapper_pg_dir + pgd_index(addr), + addr), + addr), + __pmd(__pa_symbol(swapper_pg_fixmap) | _PAGE_TABLE)); +} +#else + x86_configure_nx(); + xen_init_pt(); +#endif + +#define __FIXADDR_TOP (-PAGE_SIZE) +#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) +#define FIX_BUG_ON(fix) BUILD_BUG_ON(pmd_index(__fix_to_virt(FIX_##fix)) \ + != pmd_index(__fix_to_virt(FIX_EARLYCON_MEM_BASE))) + FIX_BUG_ON(SHARED_INFO); + FIX_BUG_ON(ISAMAP_BEGIN); + FIX_BUG_ON(ISAMAP_END); +#undef pmd_index +#undef __FIXADDR_TOP + + /* Switch to the real shared_info page, and clear the dummy page. */ + set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info); + HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); + clear_page(empty_zero_page); + + setup_vcpu_info(0); + + /* Set up mapping of lowest 1MB of physical memory. 
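+	 * In the initial domain the ISA range is mapped 1:1 so that legacy
+	 * BIOS and VGA areas remain reachable; unprivileged domains instead
+	 * map every page to the shared empty_zero_page, read-only.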
*/ + for (i = 0; i < NR_FIX_ISAMAPS; i++) + if (is_initial_xendomain()) + set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); + else + __set_fixmap(FIX_ISAMAP_BEGIN - i, + virt_to_machine(empty_zero_page), + PAGE_KERNEL_RO); + + if (is_initial_xendomain()) { + x86_platform.get_wallclock = mach_get_cmos_time; + x86_platform.set_wallclock = mach_set_rtc_mmss; + + pci_request_acs(); + } else + x86_init.resources.probe_roms = x86_init_noop; +} + +void __init xen_arch_setup(void) +{ + int ret; + static const struct callback_register __initconst event = { + .type = CALLBACKTYPE_event, + .address = CALLBACK_ADDR(hypervisor_callback) + }; + static const struct callback_register __initconst failsafe = { + .type = CALLBACKTYPE_failsafe, + .address = CALLBACK_ADDR(failsafe_callback) + }; +#ifdef CONFIG_X86_64 + static const struct callback_register __initconst syscall = { + .type = CALLBACKTYPE_syscall, + .address = CALLBACK_ADDR(system_call) + }; +#endif + static const struct callback_register __initconst nmi_cb = { + .type = CALLBACKTYPE_nmi, + .address = CALLBACK_ADDR(nmi) + }; + + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &event); + if (ret == 0) + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe); +#ifdef CONFIG_X86_64 + if (ret == 0) + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &syscall); +#endif +#if CONFIG_XEN_COMPAT <= 0x030002 +#ifdef CONFIG_X86_32 + if (ret == -ENOSYS) + ret = HYPERVISOR_set_callbacks( + event.address.cs, event.address.eip, + failsafe.address.cs, failsafe.address.eip); +#else + ret = HYPERVISOR_set_callbacks( + event.address, + failsafe.address, + syscall.address); +#endif +#endif + BUG_ON(ret); + + ret = HYPERVISOR_callback_op(CALLBACKOP_register, &nmi_cb); +#if CONFIG_XEN_COMPAT <= 0x030002 + if (ret == -ENOSYS) { + static struct xennmi_callback __initdata cb = { + .handler_address = (unsigned long)nmi + }; + + HYPERVISOR_nmi_op(XENNMI_register_callback, &cb); + } +#endif +} +#endif /* CONFIG_XEN */ diff --git a/arch/x86/kernel/head32-xen.c b/arch/x86/kernel/head32-xen.c new file mode 100644 index 0000000..fcc893b --- /dev/null +++ b/arch/x86/kernel/head32-xen.c @@ -0,0 +1,103 @@ +/* + * linux/arch/i386/kernel/head32.c -- prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2007 Eric Biederman + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +static void __init i386_default_early_setup(void) +{ + /* Initialize 32bit specific setup functions */ + x86_init.resources.reserve_resources = i386_reserve_resources; +#ifndef CONFIG_XEN + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + + reserve_ebda_region(); +#endif +} + +void __init i386_start_kernel(void) +{ +#ifdef CONFIG_XEN + struct xen_platform_parameters pp; + + WARN_ON(HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_4gb_segments)); + + init_mm.pgd = swapper_pg_dir = (pgd_t *)xen_start_info->pt_base; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) { + hypervisor_virt_start = pp.virt_start; + reserve_top_address(0UL - pp.virt_start); + } + + BUG_ON(pte_index(hypervisor_virt_start)); +#endif + + memblock_reserve(__pa_symbol(&_text), + __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); + +#ifndef CONFIG_XEN +#ifdef CONFIG_BLK_DEV_INITRD + /* Reserve INITRD */ + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + /* Assume only end is not page aligned */ + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + u64 ramdisk_size = 
boot_params.hdr.ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); + } +#endif + + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_MRST: + x86_mrst_early_setup(); + break; + case X86_SUBARCH_CE4100: + x86_ce4100_early_setup(); + break; + default: + i386_default_early_setup(); + break; + } +#else +#ifdef CONFIG_BLK_DEV_INITRD + BUG_ON(xen_start_info->flags & SIF_MOD_START_PFN); + if (xen_start_info->mod_start) + xen_initrd_start = __pa(xen_start_info->mod_start); +#endif + { + int max_cmdline; + + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) + max_cmdline = COMMAND_LINE_SIZE; + memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline); + boot_command_line[max_cmdline-1] = '\0'; + } + + i386_default_early_setup(); + xen_start_kernel(); +#endif + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + + start_kernel(); +} diff --git a/arch/x86/kernel/head64-xen.c b/arch/x86/kernel/head64-xen.c new file mode 100644 index 0000000..b2010d8 --- /dev/null +++ b/arch/x86/kernel/head64-xen.c @@ -0,0 +1,146 @@ +/* + * prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * + * Jun Nakajima + * Modified for Xen. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_XEN +static void __init zap_identity_mappings(void) +{ + pgd_t *pgd = pgd_offset_k(0UL); + pgd_clear(pgd); + __flush_tlb_all(); +} + +/* Don't add a printk in there. printk relies on the PDA which is not initialized + yet. */ +static void __init clear_bss(void) +{ + memset(__bss_start, 0, + (unsigned long) __bss_stop - (unsigned long) __bss_start); +} +#endif + +static void __init copy_bootdata(char *real_mode_data) +{ +#ifndef CONFIG_XEN + char * command_line; + + memcpy(&boot_params, real_mode_data, sizeof boot_params); + if (boot_params.hdr.cmd_line_ptr) { + command_line = __va(boot_params.hdr.cmd_line_ptr); + memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); + } +#else + int max_cmdline; + + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) + max_cmdline = COMMAND_LINE_SIZE; + memcpy(boot_command_line, xen_start_info->cmd_line, max_cmdline); + boot_command_line[max_cmdline-1] = '\0'; +#endif +} + +#include + +void __init x86_64_start_kernel(char * real_mode_data) +{ + /* + * Build-time sanity checks on the kernel image and module + * area mappings. 
(these are purely build-time and produce no code) + */ + BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START); + BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE); + BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE); + BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0); + BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + + xen_start_info = (struct start_info *)real_mode_data; + xen_start_kernel(); + +#ifndef CONFIG_XEN + /* clear bss before set_intr_gate with early_idt_handler */ + clear_bss(); + + /* Make NULL pointers segfault */ + zap_identity_mappings(); + + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { +#ifdef CONFIG_EARLY_PRINTK + set_intr_gate(i, &early_idt_handlers[i]); +#else + set_intr_gate(i, early_idt_handler); +#endif + } + load_idt((const struct desc_ptr *)&idt_descr); +#endif + + if (console_loglevel == 10) + early_printk("Kernel alive\n"); + + xen_switch_pt(); + + x86_64_start_reservations(real_mode_data); +} + +void __init x86_64_start_reservations(char *real_mode_data) +{ + copy_bootdata(__va(real_mode_data)); + + memblock_reserve(__pa_symbol(&_text), + __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); + +#ifdef CONFIG_BLK_DEV_INITRD + /* Reserve INITRD if needed. */ + if (xen_start_info->flags & SIF_MOD_START_PFN) { + reserve_pfn_range(xen_start_info->mod_start, + PFN_UP(xen_start_info->mod_len)); + xen_initrd_start = xen_start_info->mod_start << PAGE_SHIFT; + } else if (xen_start_info->mod_start) + xen_initrd_start = __pa(xen_start_info->mod_start); +#endif + + if (xen_feature(XENFEAT_auto_translated_physmap)) + xen_start_info->mfn_list = ~0UL; + else if (xen_start_info->mfn_list < __START_KERNEL_map) + reserve_pfn_range(xen_start_info->first_p2m_pfn, + xen_start_info->nr_p2m_frames); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + + start_kernel(); +} diff --git a/arch/x86/kernel/head_32-xen.S b/arch/x86/kernel/head_32-xen.S new file mode 100644 index 0000000..c434cef --- /dev/null +++ b/arch/x86/kernel/head_32-xen.S @@ -0,0 +1,220 @@ + + +.text +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * References to members of the new_cpu_data structure. 
+ */ + +#define X86 new_cpu_data+CPUINFO_x86 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor +#define X86_MODEL new_cpu_data+CPUINFO_x86_model +#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + +__HEAD +#define VIRT_ENTRY_OFFSET 0x0 +.org VIRT_ENTRY_OFFSET +ENTRY(startup_32) + movl %esi,xen_start_info + cld + + /* Set up the stack pointer */ + movl $(init_thread_union+THREAD_SIZE),%esp + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID + XEN_CPUID + movl %eax,X86_CPUID # save CPUID level + movl %ebx,X86_VENDOR_ID # lo 4 chars + movl %edx,X86_VENDOR_ID+4 # next 4 chars + movl %ecx,X86_VENDOR_ID+8 # last 4 chars + + movl $1,%eax # Use the CPUID instruction to get CPU type + XEN_CPUID + movb %al,%cl # save reg for future use + andb $0x0f,%ah # mask processor family + movb %ah,X86 + andb $0xf0,%al # mask model + shrb $4,%al + movb %al,X86_MODEL + andb $0x0f,%cl # mask mask revision + movb %cl,X86_MASK + movl %edx,X86_CAPABILITY + +#ifdef CONFIG_CC_STACKPROTECTOR + /* + * The linker can't handle this by relocation. Manually set + * base address in stack canary segment descriptor. + */ + movl $gdt_page,%eax + movl $stack_canary,%ecx + movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) + shrl $16, %ecx + movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) + movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +#endif + + # %esi still points to start_info, and no registers + # need to be preserved. + + movl XEN_START_mfn_list(%esi), %ebx + movl $(gdt_page - __PAGE_OFFSET), %eax + shrl $PAGE_SHIFT, %eax + movl (%ebx,%eax,4), %ecx + pushl %ecx # frame number for set_gdt below + + xorl %esi, %esi + xorl %edx, %edx + shldl $PAGE_SHIFT, %ecx, %edx + shll $PAGE_SHIFT, %ecx + orl $_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_DIRTY, %ecx + movl $gdt_page, %ebx + movl $__HYPERVISOR_update_va_mapping, %eax + int $0x82 + + movl $(PAGE_SIZE / 8), %ecx + movl %esp, %ebx + movl $__HYPERVISOR_set_gdt, %eax + int $0x82 + + popl %ecx + + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + + movl $(__KERNEL_STACK_CANARY),%eax + movl %eax,%gs + + cld # gcc2 wants the direction flag cleared at all times + + pushl $0 # fake return address for unwinder + jmp i386_start_kernel + +#define HYPERCALL_PAGE_OFFSET 0x1000 +.org HYPERCALL_PAGE_OFFSET +ENTRY(hypercall_page) + CFI_STARTPROC +.skip 0x1000 + CFI_ENDPROC + +/* + * BSS section + */ +__PAGE_ALIGNED_BSS + .align PAGE_SIZE +ENTRY(swapper_pg_fixmap) + .fill 1024,4,0 +ENTRY(empty_zero_page) + .fill 4096,1,0 + +/* + * This starts the data section. 
+ */ +.data + +#ifdef CONFIG_XEN_UNPRIVILEGED_GUEST +# define XEN_DOM0_CAP 0 +# define XEN_DOM0_CAP_STR "" +#else +# define XEN_DOM0_CAP (1 << XENFEAT_dom0) +# if CONFIG_XEN_COMPAT < 0x040200 +# define XEN_DOM0_CAP_STR "" +# else +# define XEN_DOM0_CAP_STR "|dom0" +# endif +#endif + +#if CONFIG_XEN_COMPAT <= 0x030002 +/* + * __xen_guest information + */ +.macro utoa value + .if (\value) < 0 || (\value) >= 0x10 + utoa (((\value)>>4)&0x0fffffff) + .endif + .if ((\value) & 0xf) < 10 + .byte '0' + ((\value) & 0xf) + .else + .byte 'A' + ((\value) & 0xf) - 10 + .endif +.endm + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=xen-3.0" + .ascii ",VIRT_BASE=0x" + utoa __PAGE_OFFSET + .ascii ",ELF_PADDR_OFFSET=0x" + utoa __PAGE_OFFSET + .ascii ",VIRT_ENTRY=0x" + utoa (__PAGE_OFFSET + LOAD_PHYSICAL_ADDR + VIRT_ENTRY_OFFSET) + .ascii ",HYPERCALL_PAGE=0x" + utoa ((LOAD_PHYSICAL_ADDR+HYPERCALL_PAGE_OFFSET)>>PAGE_SHIFT) + .ascii ",FEATURES=writable_page_tables" + .ascii "|writable_descriptor_tables" + .ascii "|auto_translated_physmap" + .ascii "|pae_pgdir_above_4gb" + .ascii "|supervisor_mode_kernel" +#ifdef CONFIG_X86_PAE + .ascii ",PAE=yes[extended-cr3]" +#else + .ascii ",PAE=no" +#endif + .ascii ",LOADER=generic" + .byte 0 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ + + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) +#if CONFIG_XEN_COMPAT <= 0x030002 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long __PAGE_OFFSET) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .long 0) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_32) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "writable_page_tables"; + .ascii "|writable_descriptor_tables"; + .ascii "|auto_translated_physmap"; + .ascii "|pae_pgdir_above_4gb"; + .ascii "|supervisor_mode_kernel"; + .asciz XEN_DOM0_CAP_STR) + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long XEN_DOM0_CAP | + (1 << XENFEAT_writable_page_tables) | + (1 << XENFEAT_writable_descriptor_tables) | + (1 << XENFEAT_auto_translated_physmap) | + (1 << XENFEAT_pae_pgdir_above_4gb) | + (1 << XENFEAT_supervisor_mode_kernel)) +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long _PAGE_PRESENT, _PAGE_PRESENT) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) diff --git a/arch/x86/kernel/head_64-xen.S b/arch/x86/kernel/head_64-xen.S new file mode 100644 index 0000000..c8ce8bd --- /dev/null +++ b/arch/x86/kernel/head_64-xen.S @@ -0,0 +1,176 @@ +/* + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * Copyright (C) 2000 Pavel Machek + * Copyright (C) 2000 Karsten Keil + * Copyright (C) 2001,2002 Andi Kleen + * Copyright (C) 2005 Eric Biederman + * Jun Nakajima + * Modified for Xen + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + __HEAD + .code64 + .globl startup_64 +startup_64: + movq $(init_thread_union+THREAD_SIZE-8),%rsp + + /* rsi 
is pointer to startup info structure. + pass it to C */ + movq %rsi,%rdi + + /* Set up %gs. + * + * The base of %gs always points to the bottom of the irqstack + * union. If the stack protector canary is enabled, it is + * located at %gs:40. Note that, on SMP, the boot cpu uses + * init data section till per cpu areas are set up. + */ + movl $MSR_GS_BASE,%ecx + movq $INIT_PER_CPU_VAR(irq_stack_union),%rax + movq %rax,%rdx + shrq $32,%rdx + wrmsr + + pushq $0 # fake return address + jmp x86_64_start_kernel + +#define NEXT_PAGE(name) \ + .balign PAGE_SIZE; \ +ENTRY(name) + + __PAGE_ALIGNED_BSS +NEXT_PAGE(init_level4_pgt) + .fill 512,8,0 + +NEXT_PAGE(level3_kernel_pgt) + .fill 512,8,0 + + /* + * This is used for vsyscall area mapping as we have a different + * level4 page table for user. + */ +NEXT_PAGE(level3_user_pgt) + .fill 512,8,0 + +NEXT_PAGE(level2_fixmap_pgt) + .fill 512,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 + + .previous +NEXT_PAGE(hypercall_page) + phys_hypercall_page = . - .head.text + CFI_STARTPROC + .rept 0x1000 / 0x20 + .skip 1 /* push %rcx */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx,0 + .skip 2 /* push %r11 */ + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx,0 + .skip 5 /* mov $#,%eax */ + .skip 2 /* syscall */ + .skip 2 /* pop %r11 */ + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE r11 + .skip 1 /* pop %rcx */ + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx + .align 0x20,0 /* ret */ + .endr + CFI_ENDPROC + +#undef NEXT_PAGE + + __PAGE_ALIGNED_BSS + .align PAGE_SIZE +ENTRY(empty_zero_page) + .skip PAGE_SIZE + +#ifdef CONFIG_XEN_UNPRIVILEGED_GUEST +# define XEN_DOM0_CAP 0 +# define XEN_DOM0_CAP_STR "" +#else +# define XEN_DOM0_CAP (1 << XENFEAT_dom0) +# if CONFIG_XEN_COMPAT < 0x040200 +# define XEN_DOM0_CAP_STR "" +# else +# define XEN_DOM0_CAP_STR "|dom0" +# endif +#endif + +#if CONFIG_XEN_COMPAT <= 0x030002 +/* + * __xen_guest information + */ +.macro utoh value + i = 64 + .rept 16 + i = i - 4 + .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf) + .endr +.endm + +.section __xen_guest + .ascii "GUEST_OS=linux,GUEST_VER=2.6" + .ascii ",XEN_VER=xen-3.0" + .ascii ",VIRT_BASE=0x" + utoh __START_KERNEL_map + .ascii ",ELF_PADDR_OFFSET=0x" + utoh __START_KERNEL_map + .ascii ",VIRT_ENTRY=0x" + utoh (__START_KERNEL_map + __PHYSICAL_START) + .ascii ",HYPERCALL_PAGE=0x" + utoh (phys_hypercall_page >> PAGE_SHIFT) + .ascii ",FEATURES=writable_page_tables" + .ascii "|writable_descriptor_tables" + .ascii "|auto_translated_physmap" + .ascii "|supervisor_mode_kernel" + .ascii ",LOADER=generic" + .byte 0 +#endif /* CONFIG_XEN_COMPAT <= 0x030002 */ + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad __START_KERNEL_map) +#if CONFIG_XEN_COMPAT <= 0x030002 + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad __START_KERNEL_map) +#else + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad 0) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .quad startup_64) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT, _PAGE_PRESENT) + ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1) + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad VMEMMAP_START) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "writable_page_tables"; + .ascii "|writable_descriptor_tables"; + .ascii "|auto_translated_physmap"; + .ascii "|supervisor_mode_kernel"; + .asciz XEN_DOM0_CAP_STR) + 
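+	/* These ELF notes are parsed by the Xen domain builder when the guest
+	   is created; XEN_ELFNOTE_SUPPORTED_FEATURES below is the bitmask
+	   counterpart of the FEATURES string above. */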
ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long XEN_DOM0_CAP | + (1 << XENFEAT_writable_page_tables) | + (1 << XENFEAT_writable_descriptor_tables) | + (1 << XENFEAT_auto_translated_physmap) | + (1 << XENFEAT_supervisor_mode_kernel)) + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 43e9ccf..c50f863 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -31,6 +31,7 @@ union thread_union init_thread_union __init_task_data = struct task_struct init_task = INIT_TASK(init_task); EXPORT_SYMBOL(init_task); +#ifndef CONFIG_X86_NO_TSS /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. The TSS size is kept cacheline-aligned @@ -39,4 +40,4 @@ EXPORT_SYMBOL(init_task); * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; - +#endif diff --git a/arch/x86/kernel/ioport-xen.c b/arch/x86/kernel/ioport-xen.c new file mode 100644 index 0000000..3cb400e --- /dev/null +++ b/arch/x86/kernel/ioport-xen.c @@ -0,0 +1,84 @@ +/* + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. 32/64 bits code unification by Miguel Botón. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + struct thread_struct *t = ¤t->thread; + struct physdev_set_iobitmap set_iobitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); + + set_xen_guest_handle(set_iobitmap.bitmap, (char *)bitmap); + set_iobitmap.nr_ports = IO_BITMAP_BITS; + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &set_iobitmap)); + } + + if (turn_on) + bitmap_clear(t->io_bitmap_ptr, from, num); + else + bitmap_set(t->io_bitmap_ptr, from, num); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + */ +long sys_iopl(unsigned int level, struct pt_regs *regs) +{ + struct thread_struct *t = ¤t->thread; + unsigned int old = t->iopl >> 12; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges? 
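+	 * Raising the IOPL requires CAP_SYS_RAWIO; lowering it is always
+	 * permitted.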
*/ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + t->iopl = level << 12; + set_iopl_mask(t->iopl); + + return 0; +} diff --git a/arch/x86/kernel/irq-xen.c b/arch/x86/kernel/irq-xen.c new file mode 100644 index 0000000..e12c762 --- /dev/null +++ b/arch/x86/kernel/irq-xen.c @@ -0,0 +1,350 @@ +/* + * Common interrupt code for 32 and 64 bit + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifndef CONFIG_XEN +atomic_t irq_err_count; + +/* Function pointer for generic interrupt vector handling */ +void (*x86_platform_ipi_callback)(void) = NULL; +#endif + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); + +#ifndef CONFIG_XEN + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + ack_APIC_irq(); +#endif +} + +#define irq_stats(x) (&per_cpu(irq_stat, x)) +/* + * /proc/interrupts printing for arch specific interrupts + */ +int arch_show_interrupts(struct seq_file *p, int prec) +{ + int j; + + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); + seq_printf(p, " Non-maskable interrupts\n"); +#ifdef CONFIG_X86_LOCAL_APIC +#ifndef CONFIG_XEN + seq_printf(p, "%*s: ", prec, "LOC"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); + + seq_printf(p, "%*s: ", prec, "SPU"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); + seq_printf(p, "%*s: ", prec, "PMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_printf(p, " Performance monitoring interrupts\n"); +#endif + seq_printf(p, "%*s: ", prec, "IWI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); + seq_printf(p, " IRQ work interrupts\n"); +#ifndef CONFIG_XEN + seq_printf(p, "%*s: ", prec, "RTR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); + seq_printf(p, " APIC ICR read retries\n"); +#endif +#endif +#ifndef CONFIG_XEN + if (x86_platform_ipi_callback) { + seq_printf(p, "%*s: ", prec, "PLT"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); + seq_printf(p, " Platform interrupts\n"); + } +#endif +#ifdef CONFIG_SMP + seq_printf(p, "%*s: ", prec, "RES"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "%*s: ", prec, "CAL"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_printf(p, " Function call interrupts\n"); +#ifndef CONFIG_XEN + seq_printf(p, "%*s: ", prec, "TLB"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#else + seq_printf(p, "%*s: ", prec, "LCK"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", 
irq_stats(j)->irq_lock_count); + seq_printf(p, " Spinlock wakeups\n"); +#endif +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR + seq_printf(p, "%*s: ", prec, "TRM"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + seq_printf(p, "%*s: ", prec, "THR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); + seq_printf(p, " Machine check exceptions\n"); + seq_printf(p, "%*s: ", prec, "MCP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); + seq_printf(p, " Machine check polls\n"); +#endif +#ifndef CONFIG_XEN + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); +#endif +#endif + return 0; +} + +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = irq_stats(cpu)->__nmi_count; + +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->irq_spurious_count; + sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_irq_work_irqs; + sum += irq_stats(cpu)->icr_read_retry_count; +#endif +#ifndef CONFIG_XEN + if (x86_platform_ipi_callback) + sum += irq_stats(cpu)->x86_platform_ipis; +#endif +#ifdef CONFIG_SMP + sum += irq_stats(cpu)->irq_resched_count; + sum += irq_stats(cpu)->irq_call_count; +#ifndef CONFIG_XEN + sum += irq_stats(cpu)->irq_tlb_count; +#else + sum += irq_stats(cpu)->irq_lock_count; +#endif +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR + sum += irq_stats(cpu)->irq_thermal_count; +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + sum += irq_stats(cpu)->irq_threshold_count; +#endif +#ifdef CONFIG_X86_MCE + sum += per_cpu(mce_exception_count, cpu); + sum += per_cpu(mce_poll_count, cpu); +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ +#ifndef CONFIG_XEN + u64 sum = atomic_read(&irq_err_count); + +#ifdef CONFIG_X86_IO_APIC + sum += atomic_read(&irq_mis_count); +#endif + return sum; +#else + return 0; +#endif +} + + +#ifndef CONFIG_XEN +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* high bit used in ret_from_ code */ + unsigned vector = ~regs->orig_ax; + unsigned irq; + + irq_enter(); + exit_idle(); + + irq = __this_cpu_read(vector_irq[vector]); + + if (!handle_irq(irq, regs)) { + ack_APIC_irq(); + + if (printk_ratelimit()) + pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), vector, irq); + } + + irq_exit(); + + set_irq_regs(old_regs); + return 1; +} + +/* + * Handler for X86_PLATFORM_IPI_VECTOR. + */ +void smp_x86_platform_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + irq_enter(); + + exit_idle(); + + inc_irq_stat(x86_platform_ipis); + + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); + + irq_exit(); + + set_irq_regs(old_regs); +} +#endif + +#ifdef CONFIG_HOTPLUG_CPU +#include +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. 
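+ * This runs on the CPU being offlined, with interrupts disabled. Interrupts
+ * still bound to this CPU get their affinity re-set (falling back to
+ * cpu_all_mask when none of their mask is online any more), and any found
+ * pending here afterwards are retriggered so they fire on the new target.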
*/ +void fixup_irqs(void) +{ + unsigned int irq; + static int warned; + struct irq_desc *desc; + struct irq_data *data; + struct irq_chip *chip; + static DECLARE_BITMAP(irqs_used, NR_IRQS); + + for_each_irq_desc(irq, desc) { + int break_affinity = 0; + int set_affinity = 1; + const struct cpumask *affinity; + + if (!desc) + continue; + if (irq == 2) + continue; + + /* interrupt's are disabled at this point */ + raw_spin_lock(&desc->lock); + + data = irq_desc_get_irq_data(desc); + affinity = data->affinity; + if (!irq_has_action(irq) || irqd_is_per_cpu(data) || + cpumask_subset(affinity, cpu_online_mask)) { + raw_spin_unlock(&desc->lock); + continue; + } + + if (cpumask_test_cpu(smp_processor_id(), affinity)) + __set_bit(irq, irqs_used); + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + break_affinity = 1; + affinity = cpu_all_mask; + } + + chip = irq_data_get_irq_chip(data); + if (!irqd_can_move_in_process_context(data) && chip->irq_mask) + chip->irq_mask(data); + + if (chip->irq_set_affinity) + chip->irq_set_affinity(data, affinity, true); + else if (data->chip != &no_irq_chip && !(warned++)) + set_affinity = 0; + + if (!irqd_can_move_in_process_context(data) && + !irqd_irq_disabled(data) && chip->irq_unmask) + chip->irq_unmask(data); + + raw_spin_unlock(&desc->lock); + + if (break_affinity && set_affinity) + /*printk("Broke affinity for irq %i\n", irq)*/; + else if (!set_affinity) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* + * We can remove mdelay() and then send spuriuous interrupts to + * new cpu targets for all the irqs that were handled previously by + * this cpu. While it works, I have seen spurious interrupt messages + * (nothing wrong but still...). + * + * So for now, retain mdelay(1) and check the IRR and then send those + * interrupts to new targets as this cpu is already offlined... + */ + mdelay(1); + + for_each_irq_desc(irq, desc) { + if (!__test_and_clear_bit(irq, irqs_used)) + continue; + + if (xen_test_irq_pending(irq)) { + desc = irq_to_desc(irq); + data = irq_desc_get_irq_data(desc); + chip = irq_data_get_irq_chip(data); + raw_spin_lock(&desc->lock); + if (chip->irq_retrigger) + chip->irq_retrigger(data); + raw_spin_unlock(&desc->lock); + } + } +} +#endif diff --git a/arch/x86/kernel/irq_work-xen.c b/arch/x86/kernel/irq_work-xen.c new file mode 100644 index 0000000..851414e --- /dev/null +++ b/arch/x86/kernel/irq_work-xen.c @@ -0,0 +1,21 @@ +/* + * x86/Xen specific code for irq_work + */ + +#include +#include +#include +#include + +#ifdef CONFIG_SMP +void smp_irq_work_interrupt(struct pt_regs *regs) +{ + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); +} + +void arch_irq_work_raise(void) +{ + xen_send_IPI_self(IRQ_WORK_VECTOR); +} +#endif diff --git a/arch/x86/kernel/ldt-xen.c b/arch/x86/kernel/ldt-xen.c new file mode 100644 index 0000000..5d56e69 --- /dev/null +++ b/arch/x86/kernel/ldt-xen.c @@ -0,0 +1,272 @@ +/* + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#ifdef CONFIG_SMP
+static void flush_ldt(void *current_mm)
+{
+	if (current->active_mm == current_mm)
+		load_LDT(&current->active_mm->context);
+}
+#endif
+
+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+{
+	void *oldldt, *newldt;
+	int oldsize;
+
+	if (mincount <= pc->size)
+		return 0;
+	oldsize = pc->size;
+	mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
+			(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
+	if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
+		newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+	else
+		newldt = (void *)__get_free_page(GFP_KERNEL);
+
+	if (!newldt)
+		return -ENOMEM;
+
+	if (oldsize)
+		memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
+	oldldt = pc->ldt;
+	memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
+	       (mincount - oldsize) * LDT_ENTRY_SIZE);
+
+#ifdef CONFIG_X86_64
+	/* CHECKME: Do we really need this ? */
+	wmb();
+#endif
+	pc->ldt = newldt;
+	wmb();
+	pc->size = mincount;
+	wmb();
+
+	if (reload) {
+#ifdef CONFIG_SMP
+		preempt_disable();
+#endif
+		make_pages_readonly(newldt,
+				    (mincount * LDT_ENTRY_SIZE) / PAGE_SIZE,
+				    XENFEAT_writable_descriptor_tables);
+		load_LDT(pc);
+#ifdef CONFIG_SMP
+		if (!cpumask_equal(mm_cpumask(current->mm),
+				   cpumask_of(smp_processor_id())))
+			smp_call_function(flush_ldt, current->mm, 1);
+		preempt_enable();
+#endif
+	}
+	if (oldsize) {
+		make_pages_writable(oldldt,
+				    (oldsize * LDT_ENTRY_SIZE) / PAGE_SIZE,
+				    XENFEAT_writable_descriptor_tables);
+		if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
+			vfree(oldldt);
+		else
+			put_page(virt_to_page(oldldt));
+	}
+	return 0;
+}
+
+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+{
+	int err = alloc_ldt(new, old->size, 0);
+
+	if (err < 0)
+		return err;
+	memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
+	make_pages_readonly(new->ldt,
+			    (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+			    XENFEAT_writable_descriptor_tables);
+	return 0;
+}
+
+/*
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct mm_struct *old_mm;
+	int retval = 0;
+
+	memset(&mm->context, 0, sizeof(mm->context));
+	mutex_init(&mm->context.lock);
+	old_mm = current->mm;
+	if (old_mm)
+		mm->context.vdso = old_mm->context.vdso;
+	if (old_mm && old_mm->context.size > 0) {
+		mutex_lock(&old_mm->context.lock);
+		retval = copy_ldt(&mm->context, &old_mm->context);
+		mutex_unlock(&old_mm->context.lock);
+	}
+	return retval;
+}
+
+/*
+ * No need to lock the MM as we are the last user
+ *
+ * 64bit: Don't touch the LDT register - we're already in the next thread.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+	if (mm->context.size) {
+		/* CHECKME: Can this ever happen ? */
+		if (mm == current->active_mm)
+			clear_LDT();
+		make_pages_writable(mm->context.ldt,
+				    (mm->context.size * LDT_ENTRY_SIZE) / PAGE_SIZE,
+				    XENFEAT_writable_descriptor_tables);
+		if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
+			vfree(mm->context.ldt);
+		else
+			put_page(virt_to_page(mm->context.ldt));
+		mm->context.size = 0;
+	}
+}
+
+static int read_ldt(void __user *ptr, unsigned long bytecount)
+{
+	int err;
+	unsigned long size;
+	struct mm_struct *mm = current->mm;
+
+	if (!mm->context.size)
+		return 0;
+	if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
+		bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
+
+	mutex_lock(&mm->context.lock);
+	size = mm->context.size * LDT_ENTRY_SIZE;
+	if (size > bytecount)
+		size = bytecount;
+
+	err = 0;
+	if (copy_to_user(ptr, mm->context.ldt, size))
+		err = -EFAULT;
+	mutex_unlock(&mm->context.lock);
+	if (err < 0)
+		goto error_return;
+	if (size != bytecount) {
+		/* zero-fill the rest */
+		if (clear_user(ptr + size, bytecount - size) != 0) {
+			err = -EFAULT;
+			goto error_return;
+		}
+	}
+	return bytecount;
+error_return:
+	return err;
+}
+
+static int read_default_ldt(void __user *ptr, unsigned long bytecount)
+{
+	/* CHECKME: Can we use _one_ random number ? */
+#ifdef CONFIG_X86_32
+	unsigned long size = 5 * sizeof(struct desc_struct);
+#else
+	unsigned long size = 128;
+#endif
+	if (bytecount > size)
+		bytecount = size;
+	if (clear_user(ptr, bytecount))
+		return -EFAULT;
+	return bytecount;
+}
+
+static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
+{
+	struct mm_struct *mm = current->mm;
+	struct desc_struct ldt;
+	int error;
+	struct user_desc ldt_info;
+
+	error = -EINVAL;
+	if (bytecount != sizeof(ldt_info))
+		goto out;
+	error = -EFAULT;
+	if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+		goto out;
+
+	error = -EINVAL;
+	if (ldt_info.entry_number >= LDT_ENTRIES)
+		goto out;
+	if (ldt_info.contents == 3) {
+		if (oldmode)
+			goto out;
+		if (ldt_info.seg_not_present == 0)
+			goto out;
+	}
+
+	mutex_lock(&mm->context.lock);
+	if (ldt_info.entry_number >= mm->context.size) {
+		error = alloc_ldt(&current->mm->context,
+				  ldt_info.entry_number + 1, 1);
+		if (error < 0)
+			goto out_unlock;
+	}
+
+	/* Allow LDTs to be cleared by the user. */
+	if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+		if (oldmode || LDT_empty(&ldt_info)) {
+			memset(&ldt, 0, sizeof(ldt));
+			goto install;
+		}
+	}
+
+	fill_ldt(&ldt, &ldt_info);
+	if (oldmode)
+		ldt.avl = 0;
+
+	/* Install the new entry ...
*/ +install: + error = write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt); + +out_unlock: + mutex_unlock(&mm->context.lock); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index a3fa43b..6d5a5d9 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -27,47 +27,9 @@ #include #include -static void set_idt(void *newidt, __u16 limit) -{ - struct desc_ptr curidt; - - /* ia32 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - load_idt(&curidt); -} - - -static void set_gdt(void *newgdt, __u16 limit) -{ - struct desc_ptr curgdt; - - /* ia32 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - load_gdt(&curgdt); -} - -static void load_segments(void) -{ -#define __STR(X) #X -#define STR(X) __STR(X) - - __asm__ __volatile__ ( - "\tljmp $"STR(__KERNEL_CS)",$1f\n" - "\t1:\n" - "\tmovl $"STR(__KERNEL_DS)",%%eax\n" - "\tmovl %%eax,%%ds\n" - "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" - "\tmovl %%eax,%%ss\n" - : : : "eax", "memory"); -#undef STR -#undef __STR -} +#ifdef CONFIG_XEN +#include +#endif static void machine_kexec_free_page_tables(struct kimage *image) { @@ -84,6 +46,17 @@ static int machine_kexec_alloc_page_tables(struct kimage *image) { image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); #ifdef CONFIG_X86_PAE +#ifdef CONFIG_XEN /* machine address must fit into xki->page_list[PA_PGD] */ + if (image->arch.pgd) { + struct page *pg = virt_to_page(image->arch.pgd); + + if (xen_limit_pages_to_max_mfn(pg, 0, BITS_PER_LONG) < 0) { + image->arch.pgd = NULL; + __free_page(pg); + return -ENOMEM; + } + } +#endif image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); #endif @@ -139,6 +112,38 @@ static void machine_kexec_prepare_page_tables(struct kimage *image) __pa(control_page), __pa(control_page)); } +#ifdef CONFIG_XEN + +#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) + +#if PAGES_NR > KEXEC_XEN_NO_PAGES +#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break +#endif + +#if PA_CONTROL_PAGE != 0 +#error PA_CONTROL_PAGE is non zero - Xen support will break +#endif + +void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) +{ + void *control_page; + + memset(xki->page_list, 0, sizeof(xki->page_list)); + + control_page = page_address(image->control_code_page); + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); + xki->page_list[PA_PGD] = __ma(image->arch.pgd); + + if (image->type == KEXEC_TYPE_DEFAULT) + xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page); +} + +#include "machine_kexec_xen.c" + +#endif /* CONFIG_XEN */ + /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -176,6 +181,7 @@ void machine_kexec_cleanup(struct kimage *image) machine_kexec_free_page_tables(image); } +#ifndef CONFIG_XEN /* * Do not allocate memory (or fail in any way) in 
machine_kexec(). * We are past the point of no return, committed to rebooting now. @@ -228,24 +234,6 @@ void machine_kexec(struct kimage *image) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); - /* - * The segment registers are funny things, they have both a - * visible and an invisible part. Whenever the visible part is - * set to a specific selector, the invisible part is loaded - * with from a table in memory. At no other time is the - * descriptor table in memory accessed. - * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. - */ - load_segments(); - /* - * The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); - /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, (unsigned long)page_list, @@ -259,6 +247,7 @@ void machine_kexec(struct kimage *image) __ftrace_enabled_restore(save_ftrace_enabled); } +#endif void arch_crash_save_vmcoreinfo(void) { diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index b3ea9db..d8d77db 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -21,6 +21,101 @@ #include #include +#ifdef CONFIG_XEN + +/* In the case of Xen, override hypervisor functions to be able to create + * a regular identity mapping page table... + */ + +#include +#include + +#define x__pmd(x) ((pmd_t) { (x) } ) +#define x__pud(x) ((pud_t) { (x) } ) +#define x__pgd(x) ((pgd_t) { (x) } ) + +#define x_pmd_val(x) ((x).pmd) +#define x_pud_val(x) ((x).pud) +#define x_pgd_val(x) ((x).pgd) + +static inline void x_set_pmd(pmd_t *dst, pmd_t val) +{ + x_pmd_val(*dst) = x_pmd_val(val); +} + +static inline void x_set_pud(pud_t *dst, pud_t val) +{ + x_pud_val(*dst) = phys_to_machine(x_pud_val(val)); +} + +static inline void x_pud_clear (pud_t *pud) +{ + x_pud_val(*pud) = 0; +} + +static inline void x_set_pgd(pgd_t *dst, pgd_t val) +{ + x_pgd_val(*dst) = phys_to_machine(x_pgd_val(val)); +} + +static inline void x_pgd_clear (pgd_t * pgd) +{ + x_pgd_val(*pgd) = 0; +} + +#define X__PAGE_KERNEL_LARGE_EXEC \ + _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_PSE +#define X_KERNPG_TABLE _PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY + +#define __ma(x) (pfn_to_mfn(__pa((x)) >> PAGE_SHIFT) << PAGE_SHIFT) + +#if PAGES_NR > KEXEC_XEN_NO_PAGES +#error PAGES_NR is greater than KEXEC_XEN_NO_PAGES - Xen support will break +#endif + +#if PA_CONTROL_PAGE != 0 +#error PA_CONTROL_PAGE is non zero - Xen support will break +#endif + +void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, struct kimage *image) +{ + void *control_page; + void *table_page; + + memset(xki->page_list, 0, sizeof(xki->page_list)); + + control_page = page_address(image->control_code_page) + PAGE_SIZE; + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + table_page = page_address(image->control_code_page); + + xki->page_list[PA_CONTROL_PAGE] = __ma(control_page); + xki->page_list[PA_TABLE_PAGE] = __ma(table_page); + + if (image->type == KEXEC_TYPE_DEFAULT) + xki->page_list[PA_SWAP_PAGE] = page_to_phys(image->swap_page); +} + +#include "machine_kexec_xen.c" + +#else /* CONFIG_XEN */ + +#define x__pmd(x) __pmd(x) +#define x__pud(x) __pud(x) +#define x__pgd(x) __pgd(x) + +#define x_set_pmd(x, y) set_pmd(x, y) +#define x_set_pud(x, y) set_pud(x, y) +#define x_set_pgd(x, y) set_pgd(x, y) + +#define x_pud_clear(x) pud_clear(x) 
+#define x_pgd_clear(x) pgd_clear(x) + +#define X__PAGE_KERNEL_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC +#define X_KERNPG_TABLE _KERNPG_TABLE + +#endif /* CONFIG_XEN */ + static int init_one_level2_page(struct kimage *image, pgd_t *pgd, unsigned long addr) { @@ -50,7 +145,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd, } pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) - set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + x_set_pmd(pmd, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); result = 0; out: return result; @@ -63,7 +158,7 @@ static void init_level2_page(pmd_t *level2p, unsigned long addr) addr &= PAGE_MASK; end_addr = addr + PUD_SIZE; while (addr < end_addr) { - set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + x_set_pmd(level2p++, x__pmd(addr | X__PAGE_KERNEL_LARGE_EXEC)); addr += PMD_SIZE; } } @@ -88,12 +183,12 @@ static int init_level3_page(struct kimage *image, pud_t *level3p, } level2p = (pmd_t *)page_address(page); init_level2_page(level2p, addr); - set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE)); + x_set_pud(level3p++, x__pud(__pa(level2p) | X_KERNPG_TABLE)); addr += PUD_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - pud_clear(level3p++); + x_pud_clear(level3p++); addr += PUD_SIZE; } out: @@ -123,12 +218,12 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p, result = init_level3_page(image, level3p, addr, last_addr); if (result) goto out; - set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE)); + x_set_pgd(level4p++, x__pgd(__pa(level3p) | X_KERNPG_TABLE)); addr += PGDIR_SIZE; } /* clear the unused entries */ while (addr < end_addr) { - pgd_clear(level4p++); + x_pgd_clear(level4p++); addr += PGDIR_SIZE; } out: @@ -189,8 +284,14 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { pgd_t *level4p; int result; + unsigned long x_max_pfn = max_pfn; + +#ifdef CONFIG_XEN + x_max_pfn = HYPERVISOR_memory_op(XENMEM_maximum_ram_page, NULL); +#endif + level4p = (pgd_t *)__va(start_pgtable); - result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT); + result = init_level4_page(image, level4p, 0, x_max_pfn << PAGE_SHIFT); if (result) return result; /* @@ -203,47 +304,6 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) return init_transition_pgtable(image, level4p); } -static void set_idt(void *newidt, u16 limit) -{ - struct desc_ptr curidt; - - /* x86-64 supports unaliged loads & stores */ - curidt.size = limit; - curidt.address = (unsigned long)newidt; - - __asm__ __volatile__ ( - "lidtq %0\n" - : : "m" (curidt) - ); -}; - - -static void set_gdt(void *newgdt, u16 limit) -{ - struct desc_ptr curgdt; - - /* x86-64 supports unaligned loads & stores */ - curgdt.size = limit; - curgdt.address = (unsigned long)newgdt; - - __asm__ __volatile__ ( - "lgdtq %0\n" - : : "m" (curgdt) - ); -}; - -static void load_segments(void) -{ - __asm__ __volatile__ ( - "\tmovl %0,%%ds\n" - "\tmovl %0,%%es\n" - "\tmovl %0,%%ss\n" - "\tmovl %0,%%fs\n" - "\tmovl %0,%%gs\n" - : : "a" (__KERNEL_DS) : "memory" - ); -} - int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -265,6 +325,7 @@ void machine_kexec_cleanup(struct kimage *image) free_transition_pgtable(image); } +#ifndef CONFIG_XEN /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. 
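The page-table fill code in init_level2/3/4_page() is shared between the native and Xen builds; only the entry encoding differs. Under Xen the kexec identity-map tables must hold machine addresses, so x_set_pud() and x_set_pgd() route each table address through phys_to_machine() (x_set_pmd() stores its value unchanged), while the native build aliases the x_*() names straight onto set_pmd()/set_pud()/set_pgd(). A minimal stand-alone sketch of that wrapper pattern, not kernel code: phys_to_machine() is stubbed with a made-up offset, and SKETCH_XEN is a hypothetical switch standing in for CONFIG_XEN.

/* Sketch only: one fill loop, two entry encodings (native vs. "machine"). */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t pud_val_t;

#ifdef SKETCH_XEN
/* Stub: pretend machine frames are the physical frames plus a fixed offset. */
static pud_val_t phys_to_machine(pud_val_t phys) { return phys + 0x100000000ull; }
#define x_pud_entry(phys, flags) (phys_to_machine(phys) | (flags))
#else
#define x_pud_entry(phys, flags) ((phys) | (flags))
#endif

int main(void)
{
	pud_val_t table[4];
	unsigned int i;

	/* The loop is identical for both builds; only the macro differs. */
	for (i = 0; i < 4; i++)
		table[i] = x_pud_entry((pud_val_t)i << 30, 0x63 /* present/rw/acc/dirty */);

	for (i = 0; i < 4; i++)
		printf("pud[%u] = %#llx\n", i, (unsigned long long)table[i]);
	return 0;
}

Built with -DSKETCH_XEN the same loop emits translated entries, without it the native ones, which is why the hunks below can swap set_pmd()/set_pud()/set_pgd() for the x_*() variants without touching the loop structure.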
@@ -311,24 +372,6 @@ void machine_kexec(struct kimage *image) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); - /* - * The segment registers are funny things, they have both a - * visible and an invisible part. Whenever the visible part is - * set to a specific selector, the invisible part is loaded - * with from a table in memory. At no other time is the - * descriptor table in memory accessed. - * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. - */ - load_segments(); - /* - * The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - set_gdt(phys_to_virt(0), 0); - set_idt(phys_to_virt(0), 0); - /* now call it */ image->start = relocate_kernel((unsigned long)image->head, (unsigned long)page_list, @@ -342,10 +385,13 @@ void machine_kexec(struct kimage *image) __ftrace_enabled_restore(save_ftrace_enabled); } +#endif void arch_crash_save_vmcoreinfo(void) { +#ifndef CONFIG_XEN /* could really be CONFIG_RELOCATABLE */ VMCOREINFO_SYMBOL(phys_base); +#endif VMCOREINFO_SYMBOL(init_level4_pgt); #ifdef CONFIG_NUMA diff --git a/arch/x86/kernel/machine_kexec_xen.c b/arch/x86/kernel/machine_kexec_xen.c new file mode 100644 index 0000000..b171ce6 --- /dev/null +++ b/arch/x86/kernel/machine_kexec_xen.c @@ -0,0 +1,29 @@ +int machine_kexec_setup_resource(struct resource *hypervisor, + struct resource *phys_cpu) +{ + /* The per-cpu crash note resources belong to the hypervisor resource */ + insert_resource(hypervisor, phys_cpu); + if (!phys_cpu->parent) /* outside of hypervisor range */ + insert_resource(&iomem_resource, phys_cpu); + + return 0; +} + +int __init machine_kexec_setup_resources(struct resource *hypervisor, + struct resource *phys_cpus, + int nr_phys_cpus) +{ + unsigned int k; + + insert_resource(&iomem_resource, hypervisor); + if (crashk_res.end > crashk_res.start) + insert_resource(&iomem_resource, &crashk_res); + + for (k = 0; k < nr_phys_cpus; k++) + machine_kexec_setup_resource(hypervisor, phys_cpus + k); + + return xen_create_contiguous_region((unsigned long)&vmcoreinfo_note, + get_order(sizeof(vmcoreinfo_note)), + BITS_PER_LONG); + +} diff --git a/arch/x86/kernel/microcode_core-xen.c b/arch/x86/kernel/microcode_core-xen.c new file mode 100644 index 0000000..4416e2d --- /dev/null +++ b/arch/x86/kernel/microcode_core-xen.c @@ -0,0 +1,299 @@ +/* + * CPU Microcode Update Driver for Linux on Xen + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+MODULE_DESCRIPTION("Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian ");
+MODULE_LICENSE("GPL");
+
+static int verbose;
+module_param(verbose, int, 0644);
+
+#define MICROCODE_VERSION "2.00-xen"
+
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ *   the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
+static DEFINE_MUTEX(microcode_mutex);
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static int do_microcode_update(const void __user *ubuf, size_t len)
+{
+	int err;
+	void *kbuf;
+
+	kbuf = vmalloc(len);
+	if (!kbuf)
+		return -ENOMEM;
+
+	if (copy_from_user(kbuf, ubuf, len) == 0) {
+		struct xen_platform_op op;
+
+		op.cmd = XENPF_microcode_update;
+		set_xen_guest_handle(op.u.microcode.data, kbuf);
+		op.u.microcode.length = len;
+		err = HYPERVISOR_platform_op(&op);
+	} else
+		err = -EFAULT;
+
+	vfree(kbuf);
+
+	return err;
+}
+
+static int microcode_open(struct inode *inode, struct file *file)
+{
+	return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
+}
+
+static ssize_t microcode_write(struct file *file, const char __user *buf,
+			       size_t len, loff_t *ppos)
+{
+	ssize_t ret = -EINVAL;
+
+	if ((len >> PAGE_SHIFT) > totalram_pages) {
+		pr_err("too much data (max %ld pages)\n", totalram_pages);
+		return ret;
+	}
+
+	mutex_lock(&microcode_mutex);
+
+	if (do_microcode_update(buf, len) == 0)
+		ret = (ssize_t)len;
+
+	mutex_unlock(&microcode_mutex);
+
+	return ret;
+}
+
+static const struct file_operations microcode_fops = {
+	.owner = THIS_MODULE,
+	.write = microcode_write,
+	.open = microcode_open,
+	.llseek = no_llseek,
+};
+
+static struct miscdevice microcode_dev = {
+	.minor = MICROCODE_MINOR,
+	.name = "microcode",
+	.nodename = "cpu/microcode",
+	.fops = &microcode_fops,
+};
+
+static int __init microcode_dev_init(void)
+{
+	int error;
+
+	if (!is_initial_xendomain())
+		return -ENODEV;
+
+	error = misc_register(&microcode_dev);
+	if (error) {
+		pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
+		return error;
+	}
+
+	return 0;
+}
+
+static void __exit microcode_dev_exit(void)
+{
+	misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+MODULE_ALIAS("devname:cpu/microcode");
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while (0)
+#endif
+
+/* fake device for request_firmware */
+static struct platform_device *microcode_pdev;
+
+static int request_microcode(const char *name)
+{
+	const struct firmware *firmware;
+	int error;
+	struct xen_platform_op op;
+
+	error = request_firmware(&firmware, name, &microcode_pdev->dev);
+	if (error) {
+		pr_debug("microcode: data file %s load failed\n", name);
+		return error;
+	}
+
+	op.cmd = XENPF_microcode_update;
+	set_xen_guest_handle(op.u.microcode.data, firmware->data);
+	op.u.microcode.length = firmware->size;
+	error = HYPERVISOR_platform_op(&op);
+
+	release_firmware(firmware);
+
+	if (error)
+		pr_debug("ucode load failed\n");
+
+	return error;
+}
+
+static const char amd_uc_name[] = "amd-ucode/microcode_amd.bin";
+static const char amd_uc_fmt[] = "amd-ucode/microcode_amd_fam%x.bin";
+static const char intel_uc_fmt[] = "intel-ucode/%02x-%02x-%02x";
+
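The three format strings just defined drive both request paths: request_microcode() fetches the named blob via request_firmware() and submits it to the hypervisor with XENPF_microcode_update, and the per-physical-CPU notifier that follows composes the name from the vendor, family, model and stepping reported for that pCPU. As a quick illustration of the name construction only, here is a stand-alone snippet; the family/model/stepping values are made-up examples, not read from any hardware.

/* Sketch only: composing microcode firmware names with the driver's format strings. */
#include <stdio.h>

static const char amd_uc_name[] = "amd-ucode/microcode_amd.bin";
static const char amd_uc_fmt[] = "amd-ucode/microcode_amd_fam%x.bin";
static const char intel_uc_fmt[] = "intel-ucode/%02x-%02x-%02x";

int main(void)
{
	char buf[36];

	/* Intel: family 6, model 0x2a, stepping 7 -> "intel-ucode/06-2a-07" */
	snprintf(buf, sizeof(buf), intel_uc_fmt, 6, 0x2a, 7);
	printf("%s\n", buf);

	/* AMD family 0x15 and newer get a per-family file... */
	snprintf(buf, sizeof(buf), amd_uc_fmt, 0x15);
	printf("%s\n", buf);

	/* ...while older AMD families share a single blob. */
	printf("%s\n", amd_uc_name);
	return 0;
}

An Intel family 6, model 0x2a, stepping 7 part would thus request "intel-ucode/06-2a-07", the same per-signature layout used by the intel-ucode firmware tree.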
+static int ucode_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct xen_platform_op op; + char buf[36]; + const char *uc_name = buf; + + switch (action) { + case CPU_ONLINE: + op.cmd = XENPF_get_cpu_version; + op.u.pcpu_version.xen_cpuid = cpu; + if (HYPERVISOR_platform_op(&op)) + break; + if (op.u.pcpu_version.family == boot_cpu_data.x86 + && op.u.pcpu_version.model == boot_cpu_data.x86_model + && op.u.pcpu_version.stepping == boot_cpu_data.x86_mask) + break; + if (strncmp(op.u.pcpu_version.vendor_id, + "GenuineIntel", 12) == 0) + snprintf(buf, sizeof(buf), intel_uc_fmt, + op.u.pcpu_version.family, + op.u.pcpu_version.model, + op.u.pcpu_version.stepping); + else if (strncmp(op.u.pcpu_version.vendor_id, + "AuthenicAMD", 12) == 0) { + if (op.u.pcpu_version.family >= 0x15) + snprintf(buf, sizeof(buf), amd_uc_fmt, + op.u.pcpu_version.family); + else + uc_name = amd_uc_name; + } else + break; + request_microcode(uc_name); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block ucode_cpu_notifier = { + .notifier_call = ucode_cpu_callback +}; + +#ifdef MODULE +/* Autoload on Intel and AMD systems */ +static const struct x86_cpu_id microcode_id[] = { + { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, + { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, + {} +}; +MODULE_DEVICE_TABLE(x86cpu, microcode_id); +#endif + +static int __init microcode_init(void) +{ + const struct cpuinfo_x86 *c = &boot_cpu_data; + char buf[36]; + const char *fw_name = buf; + int error; + + if (c->x86_vendor == X86_VENDOR_INTEL) + snprintf(buf, sizeof(buf), intel_uc_fmt, + c->x86, c->x86_model, c->x86_mask); + else if (c->x86_vendor == X86_VENDOR_AMD) { + if (c->x86 >= 0x15) + snprintf(buf, sizeof(buf), amd_uc_fmt, c->x86); + else + fw_name = amd_uc_name; + } else { + pr_err("no support for this CPU vendor\n"); + return -ENODEV; + } + + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) + return PTR_ERR(microcode_pdev); + + request_microcode(fw_name); + + error = microcode_dev_init(); + if (error) { + platform_device_unregister(microcode_pdev); + return error; + } + + pr_info("Microcode Update Driver: v" MICROCODE_VERSION + " , Peter Oruba\n"); + + error = register_pcpu_notifier(&ucode_cpu_notifier); + if (error) + pr_warn("pCPU notifier registration failed (%d)\n", error); + + return 0; +} +module_init(microcode_init); + +static void __exit microcode_exit(void) +{ + unregister_pcpu_notifier(&ucode_cpu_notifier); + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); + + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} +module_exit(microcode_exit); diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index ac861b8..2c88c6a 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -205,12 +205,20 @@ void __cpuinit fam10h_check_enable_mmcfg(void) return; } +#ifndef CONFIG_XEN printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); val &= ~((FAM10H_MMIO_CONF_BASE_MASK< + * (c) 1998, 1999, 2000, 2009 Ingo Molnar + * (c) 2008 Alexey Starikovskiy + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static void *_bus_to_virt(unsigned long ma) +{ + return is_ISA_range(ma, ma) ? 
isa_bus_to_virt(ma) : bus_to_virt(ma); +} + +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +#ifndef CONFIG_XEN +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ + return m->apicid; +} +#endif + +static void __init MP_processor_info(struct mpc_cpu *m) +{ +#ifndef CONFIG_XEN + int apicid; + char *bootup_cpu = ""; + + if (!(m->cpuflag & CPU_ENABLED)) { + disabled_cpus++; + return; + } + + apicid = x86_init.mpparse.mpc_apic_id(m); + + if (m->cpuflag & CPU_BOOTPROCESSOR) { + bootup_cpu = " (Bootup-CPU)"; + boot_cpu_physical_apicid = m->apicid; + } + + printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu); + generic_processor_info(apicid, m->apicver); +#else /* CONFIG_XEN */ + num_processors++; +#endif +} + +#ifdef CONFIG_X86_IO_APIC +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) +{ + memcpy(str, m->bustype, 6); + str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} + +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; + + x86_init.mpparse.mpc_oem_bus_info(m, str); + +#if MAX_MP_BUSSES < 256 + if (m->busid >= MAX_MP_BUSSES) { + printk(KERN_WARNING "MP table busid value (%d) for bustype %s " + " is too large, max. supported is %d\n", + m->busid, str, MAX_MP_BUSSES - 1); + return; + } +#endif + + set_bit(m->busid, mp_bus_not_pci); + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + mp_bus_id_to_type[m->busid] = MP_BUS_ISA; +#endif + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { + if (x86_init.mpparse.mpc_oem_pci_bus) + x86_init.mpparse.mpc_oem_pci_bus(m); + + clear_bit(m->busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + mp_bus_id_to_type[m->busid] = MP_BUS_PCI; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { + mp_bus_id_to_type[m->busid] = MP_BUS_EISA; + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) { + mp_bus_id_to_type[m->busid] = MP_BUS_MCA; +#endif + } else + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); +} + +static void __init MP_ioapic_info(struct mpc_ioapic *m) +{ + if (m->flags & MPC_APIC_USABLE) + mp_register_ioapic(m->apicid, m->apicaddr, gsi_top); +} + +static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) +{ + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + mp_irq->irqtype, mp_irq->irqflag & 3, + (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus, + mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); +} + +#else /* CONFIG_X86_IO_APIC */ +static inline void __init MP_bus_info(struct mpc_bus *m) {} +static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} +#endif /* CONFIG_X86_IO_APIC */ + +static void __init MP_lintsrc_info(struct mpc_lintsrc *m) +{ + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid, + m->srcbusirq, m->destapic, m->destapiclint); +} + +/* + * Read/parse the MPC + */ +static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) +{ + + if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) { + printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", + mpc->signature[0], mpc->signature[1], + mpc->signature[2], mpc->signature[3]); + return 0; + } + if 
(mpf_checksum((unsigned char *)mpc, mpc->length)) { + printk(KERN_ERR "MPTABLE: checksum error!\n"); + return 0; + } + if (mpc->spec != 0x01 && mpc->spec != 0x04) { + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", + mpc->spec); + return 0; + } + if (!mpc->lapic) { + printk(KERN_ERR "MPTABLE: null local APIC address!\n"); + return 0; + } + memcpy(oem, mpc->oem, 8); + oem[8] = 0; + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); + + memcpy(str, mpc->productid, 12); + str[12] = 0; + + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); + +#ifndef CONFIG_XEN + printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic); +#endif + + return 1; +} + +static void skip_entry(unsigned char **ptr, int *count, int size) +{ + *ptr += size; + *count += size; +} + +static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) +{ + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n" + "type %x\n", *mpt); + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, + 1, mpc, mpc->length, 1); +} + +void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } + +static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) +{ + char str[16]; + char oem[10]; + + int count = sizeof(*mpc); + unsigned char *mpt = ((unsigned char *)mpc) + count; + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + +#ifndef CONFIG_XEN +#ifdef CONFIG_X86_32 + generic_mps_oem_check(mpc, oem, str); +#endif + /* Initialize the lapic mapping */ + if (!acpi_lapic) + register_lapic_address(mpc->lapic); +#endif + + if (early) + return 1; + + if (mpc->oemptr) + x86_init.mpparse.smp_read_mpc_oem(mpc); + + /* + * Now process the configuration blocks. + */ + x86_init.mpparse.mpc_record(0); + + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info((struct mpc_cpu *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + MP_bus_info((struct mpc_bus *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + MP_ioapic_info((struct mpc_ioapic *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + mp_save_irq((struct mpc_intsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + MP_lintsrc_info((struct mpc_lintsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + count = mpc->length; + break; + } + x86_init.mpparse.mpc_record(1); + } + + if (!num_processors) + printk(KERN_ERR "MPTABLE: no processors registered!\n"); + return num_processors; +} + +#ifdef CONFIG_X86_IO_APIC + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.type = MP_INTSRC; + intsrc.irqflag = 0; /* conforming */ + intsrc.srcbus = 0; + intsrc.dstapic = mpc_ioapic_id(0); + + intsrc.irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. 
+ */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... " + "falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || + ELCR_trigger(13)) + printk(KERN_ERR "ELCR contains invalid data... " + "not using ELCR\n"); + else { + printk(KERN_INFO + "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.irqflag = 13; + else + intsrc.irqflag = 0; + } + + intsrc.srcbusirq = i; + intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + mp_save_irq(&intsrc); + } + + intsrc.irqtype = mp_ExtINT; + intsrc.srcbusirq = 0; + intsrc.dstirq = 0; /* 8259A to INTIN0 */ + mp_save_irq(&intsrc); +} + + +static void __init construct_ioapic_table(int mpc_default_type) +{ + struct mpc_ioapic ioapic; + struct mpc_bus bus; + + bus.type = MP_BUS; + bus.busid = 0; + switch (mpc_default_type) { + default: + printk(KERN_ERR "???\nUnknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.busid = 1; + memcpy(bus.bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); +} +#else +static inline void __init construct_ioapic_table(int mpc_default_type) { } +#endif + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_cpu processor; + struct mpc_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + +#ifndef CONFIG_XEN + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; +#endif + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.apicver = mpc_default_type > 4 ? 
0x10 : 0x01; + processor.cpuflag = CPU_ENABLED; + processor.cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.featureflag = boot_cpu_data.x86_capability[0]; + processor.reserved[0] = 0; + processor.reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.apicid = i; + MP_processor_info(&processor); + } + + construct_ioapic_table(mpc_default_type); + + lintsrc.type = MP_LINTSRC; + lintsrc.irqflag = 0; /* conforming */ + lintsrc.srcbusid = 0; + lintsrc.srcbusirq = 0; + lintsrc.destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.irqtype = linttypes[i]; + lintsrc.destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct mpf_intel *mpf_found; + +static unsigned long __init get_mpc_size(unsigned long physptr) +{ + struct mpc_table *mpc; + unsigned long size; + + mpc = early_ioremap(physptr, PAGE_SIZE); + size = mpc->length; + early_iounmap(mpc, PAGE_SIZE); + apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + + return size; +} + +static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) +{ + struct mpc_table *mpc; + unsigned long size; + + size = get_mpc_size(mpf->physptr); + mpc = early_ioremap(mpf->physptr, size); + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(mpc, early)) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 0; +#endif + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n" + "... disabling SMP support. (tell your hw vendor)\n"); + early_iounmap(mpc, size); + return -1; + } + early_iounmap(mpc, size); + + if (early) + return -1; + +#ifdef CONFIG_X86_IO_APIC + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " + "using default mptable. (tell your hw vendor)\n"); + + bus.type = MP_BUS; + bus.busid = 0; + memcpy(bus.bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } +#endif + + return 0; +} + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init default_get_smp_config(unsigned int early) +{ + struct mpf_intel *mpf = mpf_found; + + if (!mpf) + return; + +#ifdef CONFIG_XEN + BUG_ON(early); +#define early 0 +#endif + + if (acpi_lapic && early) + return; + + /* + * MPS doesn't support hyperthreading, aka only have + * thread 0 apic id in MPS table + */ + if (acpi_lapic && acpi_ioapic) + return; + + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", + mpf->specification); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) && !defined(CONFIG_XEN) + if (mpf->feature2 & (1 << 7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } +#endif + /* + * Now see if we need to read further. + */ + if (mpf->feature1 != 0) { +#ifndef CONFIG_XEN + if (early) { + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + return; + } +#endif + + printk(KERN_INFO "Default MP configuration #%d\n", + mpf->feature1); + construct_default_ISA_mptable(mpf->feature1); + + } else if (mpf->physptr) { + if (check_physptr(mpf, early)) + return; + } else + BUG(); + + if (!early) + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. 
+ */ +#undef early +} + +#ifndef CONFIG_XEN +static void __init smp_reserve_memory(struct mpf_intel *mpf) +{ + memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); +} +#endif + +static int __init smp_scan_config(unsigned long base, unsigned long length) +{ + unsigned int *bp = _bus_to_virt(base); + struct mpf_intel *mpf; +#ifndef CONFIG_XEN + unsigned long mem; +#endif + + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", + bp, length); + BUILD_BUG_ON(sizeof(*mpf) != 16); + + while (length > 0) { + mpf = (struct mpf_intel *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->specification == 1) + || (mpf->specification == 4))) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 1; +#endif + mpf_found = mpf; + +#ifndef CONFIG_XEN + printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", + mpf, (u64)virt_to_phys(mpf)); + + mem = virt_to_phys(mpf); + memblock_reserve(mem, sizeof(*mpf)); + if (mpf->physptr) + smp_reserve_memory(mpf); +#else + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", + mpf, ((void *)bp - _bus_to_virt(base)) + base); +#endif + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init default_find_smp_config(void) +{ +#ifndef CONFIG_XEN + unsigned int address; +#endif + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0, 0x400) || + smp_scan_config(639 * 0x400, 0x400) || + smp_scan_config(0xF0000, 0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA/MCA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + * + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. 
+ */ + +#ifndef CONFIG_XEN + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400); +#endif +} + +#ifdef CONFIG_X86_IO_APIC +static u8 __initdata irq_used[MAX_IRQ_SOURCES]; + +static int __init get_MP_intsrc_index(struct mpc_intsrc *m) +{ + int i; + + if (m->irqtype != mp_INT) + return 0; + + if (m->irqflag != 0x0f) + return 0; + + /* not legacy */ + + for (i = 0; i < mp_irq_entries; i++) { + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != 0x0f) + continue; + + if (mp_irqs[i].srcbus != m->srcbus) + continue; + if (mp_irqs[i].srcbusirq != m->srcbusirq) + continue; + if (irq_used[i]) { + /* already claimed */ + return -2; + } + irq_used[i] = 1; + return i; + } + + /* not found */ + return -1; +} + +#define SPARE_SLOT_NUM 20 + +static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; + +static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) +{ + int i; + + apic_printk(APIC_VERBOSE, "OLD "); + print_mp_irq_info(m); + + i = get_MP_intsrc_index(m); + if (i > 0) { + memcpy(m, &mp_irqs[i], sizeof(*m)); + apic_printk(APIC_VERBOSE, "NEW "); + print_mp_irq_info(&mp_irqs[i]); + return; + } + if (!i) { + /* legacy, do nothing */ + return; + } + if (*nr_m_spare < SPARE_SLOT_NUM) { + /* + * not found (-1), or duplicated (-2) are invalid entries, + * we need to use the slot later + */ + m_spare[*nr_m_spare] = m; + *nr_m_spare += 1; + } +} + +static int __init +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) +{ + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; + } + + return 0; +} +#else /* CONFIG_X86_IO_APIC */ +static +inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ + +static int __init replace_intsrc_all(struct mpc_table *mpc, + unsigned long mpc_new_phys, + unsigned long mpc_new_length) +{ +#ifdef CONFIG_X86_IO_APIC + int i; +#endif + int count = sizeof(*mpc); + int nr_m_spare = 0; + unsigned char *mpt = ((unsigned char *)mpc) + count; + + printk(KERN_INFO "mpc_length %x\n", mpc->length); + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + goto out; + } + } + +#ifdef CONFIG_X86_IO_APIC + for (i = 0; i < mp_irq_entries; i++) { + if (irq_used[i]) + continue; + + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != 0x0f) + continue; + + if (nr_m_spare > 0) { + apic_printk(APIC_VERBOSE, "*NEW* found\n"); + nr_m_spare--; + memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i])); + m_spare[nr_m_spare] = NULL; + } else { + struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; + count += sizeof(struct mpc_intsrc); + if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) + goto out; + memcpy(m, &mp_irqs[i], sizeof(*m)); + mpc->length = count; + mpt += sizeof(struct mpc_intsrc); + } + print_mp_irq_info(&mp_irqs[i]); + } +#endif +out: + /* update checksum */ + mpc->checksum = 0; + mpc->checksum -= mpf_checksum((unsigned char *)mpc, 
mpc->length); + + return 0; +} + +int enable_update_mptable; + +static int __init update_mptable_setup(char *str) +{ + enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif + return 0; +} +early_param("update_mptable", update_mptable_setup); + +static unsigned long __initdata mpc_new_phys; +static unsigned long mpc_new_length __initdata = 4096; + +/* alloc_mptable or alloc_mptable=4k */ +static int __initdata alloc_mptable; +static int __init parse_alloc_mptable_opt(char *p) +{ + enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif + alloc_mptable = 1; + if (!p) + return 0; + mpc_new_length = memparse(p, &p); + return 0; +} +early_param("alloc_mptable", parse_alloc_mptable_opt); + +void __init early_reserve_e820_mpc_new(void) +{ + if (enable_update_mptable && alloc_mptable) + mpc_new_phys = early_reserve_e820(mpc_new_length, 4); +} + +static int __init update_mp_table(void) +{ + char str[16]; + char oem[10]; + struct mpf_intel *mpf; + struct mpc_table *mpc, *mpc_new; + + if (!enable_update_mptable) + return 0; + + mpf = mpf_found; + if (!mpf) + return 0; + + /* + * Now see if we need to go further. + */ + if (mpf->feature1 != 0) + return 0; + + if (!mpf->physptr) + return 0; + + mpc = _bus_to_virt(mpf->physptr); + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + + printk(KERN_INFO "mpf: %llx\n", (u64)arbitrary_virt_to_machine(mpf)); + printk(KERN_INFO "physptr: %x\n", mpf->physptr); + + if (mpc_new_phys && mpc->length > mpc_new_length) { + mpc_new_phys = 0; + printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n", + mpc_new_length); + } + + if (!mpc_new_phys) { + unsigned char old, new; + /* check if we can change the position */ + mpc->checksum = 0; + old = mpf_checksum((unsigned char *)mpc, mpc->length); + mpc->checksum = 0xff; + new = mpf_checksum((unsigned char *)mpc, mpc->length); + if (old == new) { + printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); + return 0; + } + printk(KERN_INFO "use in-position replacing\n"); + } else { + maddr_t mpc_new_bus; + + mpc_new_bus = phys_to_machine(mpc_new_phys); + mpf->physptr = mpc_new_bus; + mpc_new = phys_to_virt(mpc_new_phys); + memcpy(mpc_new, mpc, mpc->length); + mpc = mpc_new; + /* check if we can modify that */ + if (mpc_new_bus - mpf->physptr) { + struct mpf_intel *mpf_new; + /* steal 16 bytes from [0, 1k) */ + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); + mpf_new = isa_bus_to_virt(0x400 - 16); + memcpy(mpf_new, mpf, 16); + mpf = mpf_new; + mpf->physptr = mpc_new_bus; + } + mpf->checksum = 0; + mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16); + printk(KERN_INFO "physptr new: %x\n", mpf->physptr); + } + + /* + * only replace the one with mp_INT and + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW, + * already in mp_irqs , stored by ... and mp_config_acpi_gsi, + * may need pci=routeirq for all coverage + */ + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); + + return 0; +} + +late_initcall(update_mp_table); diff --git a/arch/x86/kernel/msr-xen.c b/arch/x86/kernel/msr-xen.c new file mode 100644 index 0000000..900a894 --- /dev/null +++ b/arch/x86/kernel/msr-xen.c @@ -0,0 +1,337 @@ +#ifndef CONFIG_XEN_PRIVILEGED_GUEST +#include "msr.c" +#else +/* ----------------------------------------------------------------------- * + * + * Copyright 2010 Novell, Inc. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * x86 MSR access device + * + * This device is accessed by lseek() to the appropriate register number + * and then read/write in chunks of 8 bytes. A larger size means multiple + * reads or writes of the same register. + * + * This driver uses /dev/xen/cpu/%d/msr where %d correlates to the minor + * number, and on an SMP box will direct the access to pCPU %d. + */ + +static int msr_init(void); +static void msr_exit(void); + +#define msr_init(args...) _msr_init(args) +#define msr_exit(args...) _msr_exit(args) +#include "msr.c" +#undef msr_exit +#undef msr_init + +#include +#include + +static struct class *pmsr_class; +static unsigned int minor_bias = 10; +static unsigned int nr_xen_cpu_ids; +static unsigned long *xen_cpu_online_map; + +#define PMSR_DEV(cpu) MKDEV(MSR_MAJOR, (cpu) + minor_bias) + +static unsigned int pmsr_minor(struct inode *inode) +{ + return iminor(inode) - minor_bias; +} + +static ssize_t pmsr_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u32 __user *tmp = (u32 __user *) buf; + u32 data[2]; + u32 reg = *ppos; + unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode); + int err = 0; + ssize_t bytes = 0; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 8) { + err = rdmsr_safe_on_pcpu(cpu, reg, &data[0], &data[1]); + if (err) + break; + if (copy_to_user(tmp, &data, 8)) { + err = -EFAULT; + break; + } + tmp += 2; + bytes += 8; + } + + return bytes ? bytes : err; +} + +static ssize_t pmsr_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + const u32 __user *tmp = (const u32 __user *)buf; + u32 data[2]; + u32 reg = *ppos; + unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode); + int err = 0; + ssize_t bytes = 0; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 8) { + if (copy_from_user(&data, tmp, 8)) { + err = -EFAULT; + break; + } + err = wrmsr_safe_on_pcpu(cpu, reg, data[0], data[1]); + if (err) + break; + tmp += 2; + bytes += 8; + } + + return bytes ? 
bytes : err;
+}
+
+static long pmsr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
+{
+	u32 __user *uregs = (u32 __user *)arg;
+	u32 regs[8];
+	unsigned int cpu = pmsr_minor(file->f_path.dentry->d_inode);
+	int err;
+
+	switch (ioc) {
+	case X86_IOC_RDMSR_REGS:
+		if (!(file->f_mode & FMODE_READ)) {
+			err = -EBADF;
+			break;
+		}
+		if (copy_from_user(&regs, uregs, sizeof regs)) {
+			err = -EFAULT;
+			break;
+		}
+		err = rdmsr_safe_regs_on_pcpu(cpu, regs);
+		if (err)
+			break;
+		if (copy_to_user(uregs, &regs, sizeof regs))
+			err = -EFAULT;
+		break;
+
+	case X86_IOC_WRMSR_REGS:
+		if (!(file->f_mode & FMODE_WRITE)) {
+			err = -EBADF;
+			break;
+		}
+		if (copy_from_user(&regs, uregs, sizeof regs)) {
+			err = -EFAULT;
+			break;
+		}
+		err = wrmsr_safe_regs_on_pcpu(cpu, regs);
+		if (err)
+			break;
+		if (copy_to_user(uregs, &regs, sizeof regs))
+			err = -EFAULT;
+		break;
+
+	default:
+		err = -ENOTTY;
+		break;
+	}
+
+	return err;
+}
+
+static int pmsr_open(struct inode *inode, struct file *file)
+{
+	unsigned int cpu;
+
+	cpu = pmsr_minor(file->f_path.dentry->d_inode);
+	if (cpu >= nr_xen_cpu_ids || !test_bit(cpu, xen_cpu_online_map))
+		return -ENXIO; /* No such CPU */
+
+	return 0;
+}
+
+/*
+ * File operations we support
+ */
+static const struct file_operations pmsr_fops = {
+	.owner = THIS_MODULE,
+	.llseek = msr_seek,
+	.read = pmsr_read,
+	.write = pmsr_write,
+	.open = pmsr_open,
+	.unlocked_ioctl = pmsr_ioctl,
+	.compat_ioctl = pmsr_ioctl,
+};
+
+static int pmsr_device_create(unsigned int cpu)
+{
+	struct device *dev;
+
+	if (cpu >= nr_xen_cpu_ids) {
+		static bool warned;
+		unsigned long *map;
+
+		if ((minor_bias + cpu) >> MINORBITS) {
+			if (!warned) {
+				warned = true;
+				pr_warn("Physical MSRs of CPUs beyond %u"
+					" will not be accessible\n",
+					MINORMASK - minor_bias);
+			}
+			return -EDOM;
+		}
+
+		map = kcalloc(BITS_TO_LONGS(cpu + 1), sizeof(*map),
+			      GFP_KERNEL);
+		if (!map) {
+			if (!warned) {
+				warned = true;
+				pr_warn("Physical MSRs of CPUs beyond %u"
+					" may not be accessible\n",
+					nr_xen_cpu_ids - 1);
+			}
+			return -ENOMEM;
+		}
+
+		memcpy(map, xen_cpu_online_map,
+		       BITS_TO_LONGS(nr_xen_cpu_ids)
+		       * sizeof(*xen_cpu_online_map));
+		nr_xen_cpu_ids = min_t(unsigned int,
+				       BITS_TO_LONGS(cpu + 1) * BITS_PER_LONG,
+				       MINORMASK + 1 - minor_bias);
+		kfree(xchg(&xen_cpu_online_map, map));
+	}
+	set_bit(cpu, xen_cpu_online_map);
+	dev = device_create(pmsr_class, NULL, PMSR_DEV(cpu), NULL,
+			    "pmsr%d", cpu);
+	return IS_ERR(dev) ?
PTR_ERR(dev) : 0; +} + +static void pmsr_device_destroy(unsigned int cpu) +{ + clear_bit(cpu, xen_cpu_online_map); + device_destroy(pmsr_class, PMSR_DEV(cpu)); +} + +static int pmsr_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + pmsr_device_create(cpu); + break; + case CPU_DEAD: + pmsr_device_destroy(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block pmsr_cpu_notifier = { + .notifier_call = pmsr_cpu_callback, +}; + +static char *pmsr_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "xen/cpu/%u/msr", + MINOR(dev->devt) - minor_bias); +} + +static int __init msr_init(void) +{ + int err; + xen_platform_op_t op; + + err = _msr_init(); + if (err || !is_initial_xendomain()) + return err; + + op.cmd = XENPF_get_cpuinfo; + op.u.pcpu_info.xen_cpuid = 0; + do { + err = HYPERVISOR_platform_op(&op); + } while (err == -EBUSY); + if (err) + goto out; + nr_xen_cpu_ids = BITS_TO_LONGS(op.u.pcpu_info.max_present + 1) + * BITS_PER_LONG; + + while (minor_bias < NR_CPUS) + minor_bias *= 10; + if ((minor_bias + nr_xen_cpu_ids - 1) >> MINORBITS) + minor_bias = NR_CPUS; + if ((minor_bias + nr_xen_cpu_ids - 1) >> MINORBITS) + nr_xen_cpu_ids = MINORMASK + 1 - NR_CPUS; + + xen_cpu_online_map = kcalloc(BITS_TO_LONGS(nr_xen_cpu_ids), + sizeof(*xen_cpu_online_map), + GFP_KERNEL); + if (!xen_cpu_online_map) { + err = -ENOMEM; + goto out; + } + + if (__register_chrdev(MSR_MAJOR, minor_bias, + MINORMASK + 1 - minor_bias, + "pcpu/msr", &pmsr_fops)) { + pr_err("msr: unable to get minors for pmsr\n"); + goto out; + } + pmsr_class = class_create(THIS_MODULE, "pmsr"); + if (IS_ERR(pmsr_class)) { + err = PTR_ERR(pmsr_class); + goto out_chrdev; + } + pmsr_class->devnode = pmsr_devnode; + err = register_pcpu_notifier(&pmsr_cpu_notifier); + + if (!err && !nr_xen_cpu_ids) + err = -ENODEV; + if (!err) + return 0; + + class_destroy(pmsr_class); + +out_chrdev: + __unregister_chrdev(MSR_MAJOR, minor_bias, + MINORMASK + 1 - minor_bias, "pcpu/msr"); +out: + if (err) + pr_warn("msr: can't initialize physical MSR access (%d)\n", + err); + nr_xen_cpu_ids = 0; + kfree(xen_cpu_online_map); + return 0; +} + +static void __exit msr_exit(void) +{ + if (nr_xen_cpu_ids) { + unsigned int cpu = 0; + + unregister_pcpu_notifier(&pmsr_cpu_notifier); + for_each_set_bit(cpu, xen_cpu_online_map, nr_xen_cpu_ids) + pmsr_device_destroy(cpu); + class_destroy(pmsr_class); + __unregister_chrdev(MSR_MAJOR, minor_bias, + MINORMASK + 1 - minor_bias, "pcpu/msr"); + kfree(xen_cpu_online_map); + } + _msr_exit(); +} +#endif /* CONFIG_XEN_PRIVILEGED_GUEST */ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 47acaf3..0509542 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -232,15 +232,12 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs) pr_emerg("Dazed and confused, but trying to continue\n"); /* Clear and disable the PCI SERR error line. */ - reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; - outb(reason, NMI_REASON_PORT); + clear_serr_error(reason); } static notrace __kprobes void io_check_error(unsigned char reason, struct pt_regs *regs) { - unsigned long i; - pr_emerg( "NMI: IOCK error (debug interrupt?) 
for reason %02x on CPU %d.\n", reason, smp_processor_id()); @@ -250,17 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) panic("NMI IOCK error: Not continuing"); /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; - outb(reason, NMI_REASON_PORT); - - i = 20000; - while (--i) { - touch_nmi_watchdog(); - udelay(100); - } - - reason &= ~NMI_REASON_CLEAR_IOCHK; - outb(reason, NMI_REASON_PORT); + clear_io_check_error(reason); } static notrace __kprobes void diff --git a/arch/x86/kernel/pci-dma-xen.c b/arch/x86/kernel/pci-dma-xen.c new file mode 100644 index 0000000..14b72eb --- /dev/null +++ b/arch/x86/kernel/pci-dma-xen.c @@ -0,0 +1,366 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +static int forbid_dac __read_mostly; + +struct dma_map_ops *dma_ops = &nommu_dma_ops; +EXPORT_SYMBOL(dma_ops); + +static int iommu_sac_force __read_mostly; + +#ifdef CONFIG_IOMMU_DEBUG +int panic_on_overflow __read_mostly = 1; +int force_iommu __initdata = 1; +#else +int panic_on_overflow __read_mostly = 0; +int force_iommu __initdata = 0; +#endif + +int iommu_merge __initdata; + +int no_iommu __initdata; +#ifndef CONFIG_XEN +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; + +/* + * This variable becomes 1 if iommu=pt is passed on the kernel command line. + * If this variable is 1, IOMMU implementations do no DMA translation for + * devices and allow every device to access to whole physical memory. This is + * useful if a user wants to use an IOMMU only for KVM device assignment to + * guests and not for driver dma translation. + */ +int iommu_pass_through __read_mostly; + +/* + * Group multi-function PCI devices into a single device-group for the + * iommu_device_group interface. This tells the iommu driver to pretend + * it cannot distinguish between functions of a device, exposing only one + * group for the device. Useful for disallowing use of individual PCI + * functions from userspace drivers. + */ +int iommu_group_mf __read_mostly; +#endif + +extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; + +/* Dummy device used for NULL arguments (normally ISA). 
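A minimal user-space sketch of how the pmsr character device added earlier in this patch could be exercised, assuming its read path mirrors the stock /dev/cpu/N/msr semantics (file offset selects the MSR index, each read returns an 8-byte value); the device path follows pmsr_devnode(), and the MSR number is only an example.

/* Sketch: read one MSR on physical CPU 0 through the pmsr node.
 * Assumes the read semantics mirror /dev/cpu/N/msr: the file offset
 * selects the MSR index and each read returns an 8-byte value.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const uint32_t msr = 0x10;	/* IA32 TSC, example index only */
	uint64_t val;
	int fd = open("/dev/xen/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (pread(fd, &val, sizeof(val), msr) != sizeof(val)) {
		perror("pread");
		close(fd);
		return 1;
	}
	printf("MSR 0x%x on pCPU0 = 0x%llx\n", msr, (unsigned long long)val);
	close(fd);
	return 0;
}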
*/ +struct device x86_dma_fallback_dev = { + .init_name = "fallback device", + .coherent_dma_mask = ISA_DMA_BIT_MASK, + .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, +}; +EXPORT_SYMBOL(x86_dma_fallback_dev); + +/* Number of entries preallocated for DMA-API debugging */ +#define PREALLOC_DMA_DEBUG_ENTRIES 32768 + +int dma_set_mask(struct device *dev, u64 mask) +{ + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + + *dev->dma_mask = mask; + + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + +static struct dma_map_ops swiotlb_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = dma_generic_free_coherent, + .mapping_error = swiotlb_dma_mapping_error, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .dma_supported = swiotlb_dma_supported +}; + +static int __init pci_xen_swiotlb_detect(void) +{ + return 1; +} + +static void __init pci_xen_swiotlb_init(void) +{ + swiotlb_init(1); + if (swiotlb) { + printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + dma_ops = &swiotlb_dma_ops; + } +} + +IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, NULL, pci_xen_swiotlb_init, NULL); + +void __init pci_iommu_alloc(void) +{ + struct iommu_table_entry *p; + + sort_iommu_table(__iommu_table, __iommu_table_end); + check_iommu_entries(__iommu_table, __iommu_table_end); + + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && p->detect && p->detect() > 0) { + p->flags |= IOMMU_DETECTED; + if (p->early_init) + p->early_init(); + if (p->flags & IOMMU_FINISH_IF_DETECTED) + break; + } + } +} +void *dma_generic_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag) +{ + unsigned long dma_mask; + struct page *page; +#ifndef CONFIG_XEN + dma_addr_t addr; +#else + void *memory; +#endif + unsigned int order = get_order(size); + + dma_mask = dma_alloc_coherent_mask(dev, flag); + +#ifndef CONFIG_XEN + flag |= __GFP_ZERO; +again: +#else + flag &= ~(__GFP_DMA | __GFP_DMA32); +#endif + page = alloc_pages_node(dev_to_node(dev), flag, order); + if (!page) + return NULL; + +#ifndef CONFIG_XEN + addr = page_to_phys(page); + if (addr + size > dma_mask) { + __free_pages(page, order); + + if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { + flag = (flag & ~GFP_DMA32) | GFP_DMA; + goto again; + } + + return NULL; + } + + *dma_addr = addr; + return page_address(page); +#else + memory = page_address(page); + if (xen_create_contiguous_region((unsigned long)memory, order, + fls64(dma_mask))) { + __free_pages(page, order); + return NULL; + } + + *dma_addr = virt_to_bus(memory); + return memset(memory, 0, size); +#endif +} + +#ifdef CONFIG_XEN +void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + unsigned int order = get_order(size); + unsigned long va = (unsigned long)vaddr; + + xen_destroy_contiguous_region(va, order); + free_pages(va, order); +} +#endif + +/* + * See for the iommu kernel + * parameter documentation. 
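The Xen variant of dma_generic_alloc_coherent() above swaps machine frames underneath the allocation (xen_create_contiguous_region()) so the buffer becomes machine-contiguous. Drivers never see that detail; they go through the usual API, roughly as in the sketch below (my_dev_setup and BUF_SZ are illustrative names, not from this patch).

/* Driver-side view of the allocation path that ends in
 * dma_generic_alloc_coherent() on this tree. Illustrative only.
 */
#include <linux/device.h>
#include <linux/dma-mapping.h>

#define BUF_SZ	4096

static int my_dev_setup(struct device *dev)
{
	dma_addr_t bus_addr;
	void *buf;

	/* Declare what the device can address; dma_set_mask() is the
	 * helper defined in this file. */
	if (dma_set_mask(dev, DMA_BIT_MASK(32)))
		return -EIO;

	buf = dma_alloc_coherent(dev, BUF_SZ, &bus_addr, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* ... program bus_addr into the device, touch buf from the CPU ... */

	dma_free_coherent(dev, BUF_SZ, buf, bus_addr);
	return 0;
}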
+ */ +static __init int iommu_setup(char *p) +{ + iommu_merge = 1; + + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p, "off", 3)) + no_iommu = 1; + /* gart_parse_options has more force support */ + if (!strncmp(p, "force", 5)) + force_iommu = 1; + if (!strncmp(p, "noforce", 7)) { + iommu_merge = 0; + force_iommu = 0; + } + + if (!strncmp(p, "biomerge", 8)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic", 5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic", 7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge", 5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge", 7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac", 8)) + iommu_sac_force = 1; + if (!strncmp(p, "allowdac", 8)) + forbid_dac = 0; + if (!strncmp(p, "nodac", 5)) + forbid_dac = 1; + if (!strncmp(p, "usedac", 6)) { + forbid_dac = -1; + return 1; + } +#ifdef CONFIG_SWIOTLB + if (!strncmp(p, "soft", 4)) + swiotlb = 1; +#endif +#ifndef CONFIG_XEN + if (!strncmp(p, "pt", 2)) + iommu_pass_through = 1; + if (!strncmp(p, "group_mf", 8)) + iommu_group_mf = 1; +#endif + + gart_parse_options(p); + +#ifdef CONFIG_CALGARY_IOMMU + if (!strncmp(p, "calgary", 7)) + use_calgary = 1; +#endif /* CONFIG_CALGARY_IOMMU */ + + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 0; +} +early_param("iommu", iommu_setup); + +static int check_pages_physically_contiguous(unsigned long pfn, + unsigned int offset, + size_t length) +{ + unsigned long next_mfn; + int i; + int nr_pages; + + next_mfn = pfn_to_mfn(pfn); + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; + + for (i = 1; i < nr_pages; i++) { + if (pfn_to_mfn(++pfn) != ++next_mfn) + return 0; + } + return 1; +} + +int range_straddles_page_boundary(paddr_t p, size_t size) +{ + unsigned long pfn = p >> PAGE_SHIFT; + unsigned int offset = p & ~PAGE_MASK; + + return ((offset + size > PAGE_SIZE) && + !check_pages_physically_contiguous(pfn, offset, size)); +} + +int dma_supported(struct device *dev, u64 mask) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + +#ifdef CONFIG_PCI + if (mask > 0xffffffff && forbid_dac > 0) { + dev_info(dev, "PCI: Disallowing DAC for device\n"); + return 0; + } +#endif + + if (ops->dma_supported) + return ops->dma_supported(dev, mask); + + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < DMA_BIT_MASK(24)) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. This + allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area and + return DAC as fallback address the device may not handle it + correctly. + + As a special case some controllers have a 39bit address + mode that is as efficient as 32bit (aic79xx). Don't force + SAC for these. Assume all masks <= 40 bits are of this + type. Normally this doesn't make any difference, but gives + more gentle handling of IOMMU overflow. 
*/ + if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) { + dev_info(dev, "Force SAC with mask %Lx\n", mask); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(dma_supported); + +static int __init pci_iommu_init(void) +{ + struct iommu_table_entry *p; + dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); + +#ifdef CONFIG_PCI + dma_debug_add_bus(&pci_bus_type); +#endif + x86_init.iommu.iommu_init(); + + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && (p->flags & IOMMU_DETECTED) && p->late_init) + p->late_init(); + } + + return 0; +} +/* Must execute after PCI subsystem */ +rootfs_initcall(pci_iommu_init); + +#ifdef CONFIG_PCI +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ + +static __devinit void via_no_dac(struct pci_dev *dev) +{ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); + forbid_dac = 1; + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); +#endif diff --git a/arch/x86/kernel/pci-nommu-xen.c b/arch/x86/kernel/pci-nommu-xen.c new file mode 100644 index 0000000..9dc9d8e --- /dev/null +++ b/arch/x86/kernel/pci-nommu-xen.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#define IOMMU_BUG_ON(test) \ +do { \ + if (unlikely(test)) { \ + printk(KERN_ALERT "Fatal DMA error! " \ + "Please use 'swiotlb=force'\n"); \ + BUG(); \ + } \ +} while (0) + +static int +gnttab_map_sg(struct device *hwdev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + unsigned int i; + struct scatterlist *sg; + + WARN_ON(nents == 0 || sgl->length == 0); + + for_each_sg(sgl, sg, nents, i) { + BUG_ON(!sg_page(sg)); + sg->dma_address = + gnttab_dma_map_page(sg_page(sg)) + sg->offset; + sg->dma_length = sg->length; + IOMMU_BUG_ON(!dma_capable( + hwdev, sg->dma_address, sg->length)); + IOMMU_BUG_ON(range_straddles_page_boundary( + page_to_pseudophys(sg_page(sg)) + sg->offset, + sg->length)); + } + + return nents; +} + +static void +gnttab_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + unsigned int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) + gnttab_dma_unmap_page(sg->dma_address); +} + +static dma_addr_t +gnttab_map_page(struct device *dev, struct page *page, unsigned long offset, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + dma_addr_t dma; + + WARN_ON(size == 0); + + dma = gnttab_dma_map_page(page) + offset; + IOMMU_BUG_ON(range_straddles_page_boundary(page_to_pseudophys(page) + + offset, size)); + IOMMU_BUG_ON(!dma_capable(dev, dma, size)); + + return dma; +} + +static void +gnttab_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + gnttab_dma_unmap_page(dma_addr); +} + +static void nommu_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + + +static void nommu_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + +static int nommu_dma_supported(struct device *hwdev, u64 mask) +{ + return 1; +} + +struct dma_map_ops nommu_dma_ops = { + .alloc_coherent = dma_generic_alloc_coherent, + .free_coherent = dma_generic_free_coherent, + .map_page = gnttab_map_page, + .unmap_page = 
gnttab_unmap_page, + .map_sg = gnttab_map_sg, + .unmap_sg = gnttab_unmap_sg, + .sync_single_for_device = nommu_sync_single_for_device, + .sync_sg_for_device = nommu_sync_sg_for_device, + .dma_supported = nommu_dma_supported, +}; diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c index a311ffc..965c549 100644 --- a/arch/x86/kernel/pcspeaker.c +++ b/arch/x86/kernel/pcspeaker.c @@ -6,6 +6,11 @@ static __init int add_pcspkr(void) { struct platform_device *pd; +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return 0; +#endif + pd = platform_device_register_simple("pcspkr", -1, NULL, 0); return IS_ERR(pd) ? PTR_ERR(pd) : 0; diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 34e06e8..5fb1971 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -114,6 +114,11 @@ static struct resource *find_oprom(struct pci_dev *pdev) struct resource *oprom = NULL; int i; +#ifdef CONFIG_XEN + if (!is_initial_xendomain()) + return NULL; +#endif + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { struct resource *res = &adapter_rom_resources[i]; unsigned short offset, vendor, device, list, rev; @@ -232,7 +237,7 @@ void __init probe_roms(void) upper = system_rom_resource.start; /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); + rom = isa_bus_to_virt((unsigned long)extension_rom_resource.start); if (romsignature(rom)) { length = resource_size(&extension_rom_resource); if (romchecksum(rom, length)) { diff --git a/arch/x86/kernel/process-xen.c b/arch/x86/kernel/process-xen.c new file mode 100644 index 0000000..521d602 --- /dev/null +++ b/arch/x86/kernel/process-xen.c @@ -0,0 +1,630 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct kmem_cache *task_xstate_cachep; +EXPORT_SYMBOL_GPL(task_xstate_cachep); + +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + int ret; + + *dst = *src; + if (fpu_allocated(&src->thread.fpu)) { + memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); + ret = fpu_alloc(&dst->thread.fpu); + if (ret) + return ret; + fpu_copy(&dst->thread.fpu, &src->thread.fpu); + } + return 0; +} + +void free_thread_xstate(struct task_struct *tsk) +{ + fpu_free(&tsk->thread.fpu); +} + +void free_thread_info(struct thread_info *ti) +{ + free_thread_xstate(ti->task); + free_pages((unsigned long)ti, THREAD_ORDER); +} + +void arch_task_cache_init(void) +{ + task_xstate_cachep = + kmem_cache_create("task_xstate", xstate_size, + __alignof__(union thread_xstate), + SLAB_PANIC | SLAB_NOTRACK, NULL); +} + +/* + * Free current thread data structures etc.. 
+ */ +void exit_thread(void) +{ + struct task_struct *me = current; + struct thread_struct *t = &me->thread; + unsigned long *bp = t->io_bitmap_ptr; + + if (bp) { + struct physdev_set_iobitmap set_iobitmap; + + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); + /* + * Careful, clear this in the TSS too: + */ + memset(&set_iobitmap, 0, sizeof(set_iobitmap)); + WARN_ON(HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, + &set_iobitmap)); + t->io_bitmap_max = 0; + kfree(bp); + } +} + +void show_regs(struct pt_regs *regs) +{ + show_registers(regs); + show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0); +} + +void show_regs_common(void) +{ + const char *vendor, *product, *board; + + vendor = dmi_get_system_info(DMI_SYS_VENDOR); + if (!vendor) + vendor = ""; + product = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!product) + product = ""; + + /* Board Name is optional */ + board = dmi_get_system_info(DMI_BOARD_NAME); + + printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + printk(KERN_CONT " %s %s", vendor, product); + if (board) + printk(KERN_CONT "/%s", board); + printk(KERN_CONT "\n"); +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + + flush_ptrace_hw_breakpoint(tsk); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + /* + * Forget coprocessor state.. + */ + tsk->fpu_counter = 0; + clear_fpu(tsk); + clear_used_math(); +} + +static void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} + +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_disable_TSC(); + preempt_enable(); +} + +static void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} + +static void enable_TSC(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. 
+ */ + hard_enable_TSC(); + preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (test_thread_flag(TIF_NOTSC)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (val == PR_TSC_SIGSEGV) + disable_TSC(); + else if (val == PR_TSC_ENABLE) + enable_TSC(); + else + return -EINVAL; + + return 0; +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev, *next; + + prev = &prev_p->thread; + next = &next_p->thread; + + if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ + test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); + + debugctl &= ~DEBUGCTLMSR_BTF; + if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) + debugctl |= DEBUGCTLMSR_BTF; + + update_debugctlmsr(debugctl); + } + + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); + } + propagate_user_return_notify(prev_p, next_p); +} + +int sys_fork(struct pt_regs *regs) +{ + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. + * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +int sys_vfork(struct pt_regs *regs) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, + NULL, NULL); +} + +long +sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) +{ + if (!newsp) + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); +} + +/* + * This gets run with %si containing the + * function to call, and %di containing + * the "args". + */ +extern void kernel_thread_helper(void); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + + regs.si = (unsigned long) fn; + regs.di = (unsigned long) arg; + +#ifdef CONFIG_X86_32 + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; +#else + regs.ss = __KERNEL_DS; +#endif + + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); + +/* + * sys_execve() executes a new program. + */ +long sys_execve(const char __user *name, + const char __user *const __user *argv, + const char __user *const __user *envp, struct pt_regs *regs) +{ + long error; + char *filename; + + filename = getname(name); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + return error; + error = do_execve(filename, argv, envp, regs); + +#ifdef CONFIG_X86_32 + if (error == 0) { + /* Make sure we don't return using sysenter.. 
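get_tsc_mode()/set_tsc_mode() earlier in this file back the PR_GET_TSC/PR_SET_TSC prctl()s, with hard_disable_TSC() setting CR4.TSD so a later RDTSC from user mode faults. A minimal user-space sketch of that round trip:

/* Sketch: toggle RDTSC availability for the current task via prctl().
 * With PR_TSC_SIGSEGV in effect, a subsequent rdtsc raises SIGSEGV;
 * the handlers above flip CR4.TSD for the calling task.
 */
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	int mode = 0;

	if (prctl(PR_GET_TSC, &mode, 0, 0, 0) == 0)
		printf("TSC mode: %s\n",
		       mode == PR_TSC_ENABLE ? "enabled" : "trapping");

	/* Make RDTSC fault for this task... */
	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);
	/* ...and restore it. */
	prctl(PR_SET_TSC, PR_TSC_ENABLE, 0, 0, 0);
	return 0;
}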
*/ + set_thread_flag(TIF_IRET); + } +#endif + + putname(filename); + return error; +} + +/* + * Idle related variables and functions + */ +unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; +EXPORT_SYMBOL(boot_option_idle_override); + +/* + * Powermanagement idle function, if any.. + */ +void (*pm_idle)(void); +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(pm_idle); +#endif + +/* + * We use this if we don't have any better + * idle routine.. + */ +void xen_idle(void) +{ + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle(1, smp_processor_id()); + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we + * test NEED_RESCHED: + */ + smp_mb(); + + if (!need_resched()) + safe_halt(); /* enables interrupts racelessly */ + else + local_irq_enable(); + current_thread_info()->status |= TS_POLLING; + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(default_idle); +#endif + +bool __init set_pm_idle_to_default(void) +{ + bool ret = !!pm_idle; + + pm_idle = xen_idle; + + return ret; +} +void stop_this_cpu(void *dummy) +{ + local_irq_disable(); + /* + * Remove this CPU: + */ + set_cpu_online(smp_processor_id(), false); + disable_all_local_evtchn(); + + for (;;) { + if (hlt_works(smp_processor_id())) + halt(); + } +} + +static void do_nothing(void *unused) +{ +} + +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. Old + * pm_idle value will not be used by any CPU after the return of this function. + */ +void cpu_idle_wait(void) +{ + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 1); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +#ifndef CONFIG_XEN +/* Default MONITOR/MWAIT with no hints, used for default C1 state */ +static void mwait_idle(void) +{ + if (!need_resched()) { + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle(1, smp_processor_id()); + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) + clflush((void *)¤t_thread_info()->flags); + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + } else + local_irq_enable(); +} +#endif + +/* + * On SMP it's slightly faster (but much more power-consuming!) + * to poll the ->work.need_resched flag instead of waiting for the + * cross-CPU IPI to arrive. Use this option with caution. + */ +static void poll_idle(void) +{ + trace_power_start(POWER_CSTATE, 0, smp_processor_id()); + trace_cpu_idle(0, smp_processor_id()); + local_irq_enable(); + while (!need_resched()) + cpu_relax(); + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); +} + +#ifndef CONFIG_XEN +/* + * mwait selection logic: + * + * It depends on the CPU. For AMD CPUs that support MWAIT this is + * wrong. Family 0x10 and 0x11 CPUs will enter C1 on HLT. Powersavings + * then depend on a clock divisor and current Pstate of the core. If + * all cores of a processor are in halt state (C1) the processor can + * enter the C1E (C1 enhanced) state. If mwait is used this will never + * happen. 
+ * + * idle=mwait overrides this decision and forces the usage of mwait. + */ + +#define MWAIT_INFO 0x05 +#define MWAIT_ECX_EXTENDED_INFO 0x01 +#define MWAIT_EDX_C1 0xf0 + +int mwait_usable(const struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + + if (boot_option_idle_override == IDLE_FORCE_MWAIT) + return 1; + + if (c->cpuid_level < MWAIT_INFO) + return 0; + + cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx); + /* Check, whether EDX has extended info about MWAIT */ + if (!(ecx & MWAIT_ECX_EXTENDED_INFO)) + return 1; + + /* + * edx enumeratios MONITOR/MWAIT extensions. Check, whether + * C1 supports MWAIT + */ + return (edx & MWAIT_EDX_C1); +} + +bool amd_e400_c1e_detected; +EXPORT_SYMBOL(amd_e400_c1e_detected); + +static cpumask_var_t amd_e400_c1e_mask; + +void amd_e400_remove_cpu(int cpu) +{ + if (amd_e400_c1e_mask != NULL) + cpumask_clear_cpu(cpu, amd_e400_c1e_mask); +} + +/* + * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt + * pending message MSR. If we detect C1E, then we handle it the same + * way as C3 power states (local apic timer and TSC stop) + */ +static void amd_e400_idle(void) +{ + if (need_resched()) + return; + + if (!amd_e400_c1e_detected) { + u32 lo, hi; + + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + + if (lo & K8_INTP_C1E_ACTIVE_MASK) { + amd_e400_c1e_detected = true; + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + printk(KERN_INFO "System has AMD C1E enabled\n"); + } + } + + if (amd_e400_c1e_detected) { + int cpu = smp_processor_id(); + + if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { + cpumask_set_cpu(cpu, amd_e400_c1e_mask); + /* + * Force broadcast so ACPI can not interfere. + */ + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, + &cpu); + printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", + cpu); + } + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); + + default_idle(); + + /* + * The switch back from broadcast mode needs to be + * called with interrupts disabled. + */ + local_irq_disable(); + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); + local_irq_enable(); + } else + default_idle(); +} +#endif + +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) +{ +#ifndef CONFIG_XEN +#ifdef CONFIG_SMP + if (pm_idle == poll_idle && smp_num_siblings > 1) { + printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," + " performance may degrade.\n"); + } +#endif + if (pm_idle) + return; + + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { + /* + * One CPU supports mwait => All CPUs supports mwait + */ + printk(KERN_INFO "using mwait in idle threads.\n"); + pm_idle = mwait_idle; + } else if (cpu_has_amd_erratum(amd_erratum_400)) { + /* E400: APIC timer interrupt does not wake up CPU from C1e */ + printk(KERN_INFO "using AMD E400 aware idle routine\n"); + pm_idle = amd_e400_idle; + } else + pm_idle = default_idle; +#endif +} + +void __init init_amd_e400_c1e_mask(void) +{ +#ifndef CONFIG_XEN + /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. 
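mwait_usable() above gates the mwait idle selection on CPUID leaf 5: ECX bit 0 advertises extended MWAIT info, and bits 7:4 of EDX (mask 0xf0) count the C1 sub-states. The same check can be reproduced from user space, for example:

/* Sketch of the CPUID.5 check that mwait_usable() performs in the kernel. */
#include <cpuid.h>
#include <stdio.h>

#define MWAIT_INFO		0x05
#define MWAIT_ECX_EXTENDED_INFO	0x01
#define MWAIT_EDX_C1		0xf0

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(MWAIT_INFO, &eax, &ebx, &ecx, &edx)) {
		printf("CPUID leaf 5 not supported\n");
		return 0;
	}
	if (!(ecx & MWAIT_ECX_EXTENDED_INFO)) {
		printf("no extended MWAIT info; assume usable\n");
		return 0;
	}
	printf("MWAIT C1 sub-states: %s\n",
	       (edx & MWAIT_EDX_C1) ? "present" : "absent");
	return 0;
}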
*/ + if (pm_idle == amd_e400_idle) + zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL); +#endif +} + +static int __init idle_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "poll")) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; + boot_option_idle_override = IDLE_POLL; +#ifndef CONFIG_XEN + } else if (!strcmp(str, "mwait")) { + boot_option_idle_override = IDLE_FORCE_MWAIT; + WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n"); + } else if (!strcmp(str, "halt")) { + /* + * When the boot option of idle=halt is added, halt is + * forced to be used for CPU idle. In such case CPU C2/C3 + * won't be used again. + * To continue to load the CPU idle driver, don't touch + * the boot_option_idle_override. + */ + pm_idle = default_idle; + boot_option_idle_override = IDLE_HALT; + } else if (!strcmp(str, "nomwait")) { + /* + * If the boot option of "idle=nomwait" is added, + * it means that mwait will be disabled for CPU C2/C3 + * states. In such case it won't touch the variable + * of boot_option_idle_override. + */ + boot_option_idle_override = IDLE_NOMWAIT; +#endif + } else + return -1; + + return 0; +} +early_param("idle", idle_setup); + +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} + diff --git a/arch/x86/kernel/process_32-xen.c b/arch/x86/kernel/process_32-xen.c new file mode 100644 index 0000000..08e8000 --- /dev/null +++ b/arch/x86/kernel/process_32-xen.c @@ -0,0 +1,446 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_MATH_EMULATION +#include +#endif + +#include + +#include + +#include +#include +#include +#include +#include +#include + +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +asmlinkage void cstar_ret_from_fork(void) __asm__("cstar_ret_from_fork"); + +/* + * Return saved PC of a blocked thread. + */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + return ((unsigned long *)tsk->thread.sp)[3]; +} + +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + int cpu = smp_processor_id(); + + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). 
+ */ + boot_init_stack_canary(); + + current_thread_info()->status |= TS_POLLING; + + /* endless idle loop with no priority at all */ + while (1) { + tick_nohz_idle_enter(); + rcu_idle_enter(); + while (!need_resched()) { + + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(cpu)) + play_dead(); + + local_touch_nmi(); + local_irq_disable(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); + if (cpuidle_idle_call()) + xen_idle(); + start_critical_timings(); + } + rcu_idle_exit(); + tick_nohz_idle_exit(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + +void __show_regs(struct pt_regs *regs, int all) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned long sp; + unsigned short ss, gs; + + if (user_mode_vm(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + gs = get_user_gs(regs); + } else { + sp = kernel_stack_pointer(regs); + savesegment(ss, ss); + savesegment(gs, gs); + } + + show_regs_common(); + + printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + (u16)regs->cs, regs->ip, regs->flags, + smp_processor_id()); + print_symbol("EIP is at %s\n", regs->ip); + + printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + regs->si, regs->di, regs->bp, sp); + printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); + + if (!all) + return; + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4_safe(); + printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + cr0, cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + d0, d1, d2, d3); + + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", + d6, d7); +} + +void release_thread(struct task_struct *dead_task) +{ + BUG_ON(dead_task->mm); + release_vm86_irqs(dead_task); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. + */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(unsigned long clone_flags, unsigned long sp, + unsigned long unused, + struct task_struct *p, struct pt_regs *regs) +{ + struct pt_regs *childregs; + struct task_struct *tsk; + int err; + + childregs = task_pt_regs(p); + *childregs = *regs; + childregs->ax = 0; + childregs->sp = sp; + + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); + + p->thread.ip = (unsigned long) ret_from_fork; + + task_user_gs(p) = get_user_gs(regs); + + p->fpu_counter = 0; + p->thread.io_bitmap_ptr = NULL; + tsk = current; + err = -ENOMEM; + + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + +#ifdef TIF_CSTAR + if (test_tsk_thread_flag(tsk, TIF_CSTAR)) + p->thread.ip = (unsigned long) cstar_ret_from_fork; +#endif + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { + p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, + IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + set_tsk_thread_flag(p, TIF_IO_BITMAP); + } + + err = 0; + + /* + * Set a new TLS for the child thread? 
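The CLONE_SETTLS branch just below hands childregs->si to do_set_thread_area(), the same path user space reaches with the set_thread_area syscall on 32-bit. A hedged sketch of installing such a TLS descriptor directly (the descriptor fields mirror this patch's set_32bit_tls() in the 64-bit file):

/* Sketch (32-bit user space): install a TLS descriptor through the same
 * do_set_thread_area() path used for CLONE_SETTLS above. entry_number -1
 * asks the kernel to pick a free GDT TLS slot.
 */
#include <asm/ldt.h>		/* struct user_desc */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static char tls_block[256];	/* illustrative TLS backing store */

int main(void)
{
	struct user_desc ud = {
		.entry_number	= -1,		/* allocate a slot */
		.base_addr	= (unsigned long)tls_block,
		.limit		= 0xfffff,
		.seg_32bit	= 1,
		.limit_in_pages	= 1,
		.useable	= 1,
	};

	if (syscall(SYS_set_thread_area, &ud)) {
		perror("set_thread_area");
		return 1;
	}
	printf("TLS installed in GDT entry %u\n", ud.entry_number);
	return 0;
}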
+ */ + if (clone_flags & CLONE_SETTLS) + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); + + p->thread.iopl = current->thread.iopl; + + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + return err; +} + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + set_user_gs(regs, 0); + regs->fs = 0; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + regs->ip = new_ip; + regs->sp = new_sp; + /* + * Free the old FP and other extended state + */ + free_thread_xstate(current); +} +EXPORT_SYMBOL_GPL(start_thread); + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * We fsave/fwait so that an exception goes off at the right time + * (as a call from the fsave or fwait in effect) rather than to + * the wrong process. Lazy FP saving no longer makes any sense + * with modern CPU's, and this simplifies a lot of things (SMP + * and UP become the same). + * + * NOTE! We used to use the x86 hardware context switching. The + * reason for not using it any more becomes apparent when you + * try to recover gracefully from saved state that is no longer + * valid (stale segment register values in particular). With the + * hardware task-switch, there is no way to fix up bad state in + * a reasonable manner. + * + * The fact that Intel documents the hardware task-switching to + * be slow is a fairly red herring - this code is not noticeably + * faster. However, there _is_ some room for improvement here, + * so the performance issues may eventually be a valid point. + * More important, however, is the fact that this allows us much + * more flexibility. + * + * The return value (in %ax) will be the "prev" task after + * the task-switch, and shows up in ret_from_fork in entry.S, + * for example. + */ +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *tss = &per_cpu(init_tss, cpu); +#endif + fpu_switch_t fpu; +#if CONFIG_XEN_COMPAT > 0x030002 + struct physdev_set_iopl iopl_op; + struct physdev_set_iobitmap iobmp_op; +#else + struct physdev_op _pdo[2], *pdo = _pdo; +#define iopl_op pdo->u.set_iopl +#define iobmp_op pdo->u.set_iobitmap +#endif + multicall_entry_t _mcl[8], *mcl = _mcl; + + /* XEN NOTE: FS/GS saved in switch_mm(), not here. */ + + fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl); + + /* + * Reload sp0. + * This is load_sp0(tss, next) with a multicall. + */ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = __KERNEL_DS; + mcl->args[1] = next->sp0; + mcl++; + + /* + * Load the per-thread Thread-Local Storage descriptor. + * This is load_TLS(next, cpu) with multicalls. + */ +#define C(i) do { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ + mcl->op = __HYPERVISOR_update_descriptor; \ + *(u64 *)&mcl->args[0] = arbitrary_virt_to_machine( \ + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ + *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \ + mcl++; \ + } \ +} while (0) + C(0); C(1); C(2); +#undef C + + if (unlikely(prev->iopl != next->iopl)) { + iopl_op.iopl = (next->iopl == 0) ? 
1 : (next->iopl >> 12) & 3; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iopl; + mcl->args[1] = (unsigned long)&iopl_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iopl; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + set_xen_guest_handle(iobmp_op.bitmap, + (char *)next->io_bitmap_ptr); + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iobitmap; + mcl->args[1] = (unsigned long)&iobmp_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iobitmap; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + +#if CONFIG_XEN_COMPAT <= 0x030002 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); +#endif + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) + BUG(); + + /* + * Now maybe handle debug registers + */ + if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || + task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) + __switch_to_xtra(prev_p, next_p); + + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated, and must be + * done before math_state_restore, so the TS bit is up + * to date. + */ + arch_end_context_switch(next_p); + + /* + * Restore %gs if needed (which is common) + */ + if (prev->gs | next->gs) + lazy_load_gs(next->gs); + + switch_fpu_finish(next_p, fpu); + + percpu_write(current_task, next_p); + + return prev_p; +} + +#define top_esp (THREAD_SIZE - sizeof(unsigned long)) +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long bp, sp, ip; + unsigned long stack_page; + int count = 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack_page = (unsigned long)task_stack_page(p); + sp = p->thread.sp; + if (!stack_page || sp < stack_page || sp > top_esp+stack_page) + return 0; + /* include/asm-i386/system.h:switch_to() pushes bp last. */ + bp = *(unsigned long *) sp; + do { + if (bp < stack_page || bp > top_ebp+stack_page) + return 0; + ip = *(unsigned long *) (bp+4); + if (!in_sched_functions(ip)) + return ip; + bp = *(unsigned long *) bp; + } while (count++ < 16); + return 0; +} + diff --git a/arch/x86/kernel/process_64-xen.c b/arch/x86/kernel/process_64-xen.c new file mode 100644 index 0000000..a9e3ce3 --- /dev/null +++ b/arch/x86/kernel/process_64-xen.c @@ -0,0 +1,695 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + * + * X86-64 port + * Andi Kleen. + * + * CPU hotplug support - ashok.raj@intel.com + * + * Jun Nakajima + * Modified for Xen + */ + +/* + * This file handles the architecture-dependent parts of process handling.. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage extern void ret_from_fork(void); + +static DEFINE_PER_CPU(unsigned char, is_idle); + +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); + +void enter_idle(void) +{ + percpu_write(is_idle, 1); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); +} + +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + current_thread_info()->status |= TS_POLLING; + + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); + + /* endless idle loop with no priority at all */ + while (1) { + tick_nohz_idle_enter(); + while (!need_resched()) { + + rmb(); + + if (cpu_is_offline(smp_processor_id())) + play_dead(); + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_touch_nmi(); + local_irq_disable(); + enter_idle(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); + + /* enter_idle() needs rcu for notifiers */ + rcu_idle_enter(); + + if (cpuidle_idle_call()) + xen_idle(); + + rcu_idle_exit(); + start_critical_timings(); + + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. 
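enter_idle()/exit_idle() above fire an atomic notifier chain around idle residency; other kernel code can hook it through the idle_notifier_register() interface exported here, roughly as sketched below (my_idle_event and friends are hypothetical names).

/* Sketch: hook the x86-64 idle notifier chain exported above.
 * The callback runs on IDLE_START/IDLE_END transitions of a CPU.
 */
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/printk.h>
#include <asm/idle.h>

static int my_idle_event(struct notifier_block *nb, unsigned long val,
			 void *data)
{
	if (val == IDLE_START)
		pr_debug("cpu entering idle\n");
	else if (val == IDLE_END)
		pr_debug("cpu leaving idle\n");
	return NOTIFY_OK;
}

static struct notifier_block my_idle_nb = {
	.notifier_call = my_idle_event,
};

static int __init my_idle_hook_init(void)
{
	idle_notifier_register(&my_idle_nb);
	return 0;
}
device_initcall(my_idle_hook_init);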
*/ + __exit_idle(); + } + + tick_nohz_idle_exit(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + +/* Prints also some state that isn't saved in the pt_regs */ +void __show_regs(struct pt_regs *regs, int all) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned int fsindex, gsindex; + unsigned int ds, cs, es; + + show_regs_common(); + printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk_address(regs->ip, 1); + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + regs->sp, regs->flags); + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", + regs->ax, regs->bx, regs->cx); + printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", + regs->dx, regs->si, regs->di); + printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", + regs->bp, regs->r8, regs->r9); + printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", + regs->r10, regs->r11, regs->r12); + printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", + regs->r13, regs->r14, regs->r15); + + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + asm("movl %%es,%0" : "=r" (es)); + asm("mov %%fs,%0" : "=r" (fsindex)); + asm("mov %%gs,%0" : "=r" (gsindex)); + + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + + if (!all) + return; + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4(); + + printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + fs, fsindex, gs, gsindex, shadowgs); + printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + es, cr0); + printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); +} + +void xen_load_gs_index(unsigned gs) +{ + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs)); +} +EXPORT_SYMBOL(xen_load_gs_index); + +void release_thread(struct task_struct *dead_task) +{ + if (dead_task->mm) { + if (dead_task->mm->context.size) { + printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); + BUG(); + } + } +} + +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) +{ + struct user_desc ud = { + .base_addr = addr, + .limit = 0xfffff, + .seg_32bit = 1, + .limit_in_pages = 1, + .useable = 1, + }; + struct desc_struct *desc = t->thread.tls_array; + desc += tls; + fill_ldt(desc, &ud); +} + +static inline u32 read_32bit_tls(struct task_struct *t, int tls) +{ + return get_desc_base(&t->thread.tls_array[tls]); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. 
+ */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(unsigned long clone_flags, unsigned long sp, + unsigned long unused, + struct task_struct *p, struct pt_regs *regs) +{ + int err; + struct pt_regs *childregs; + struct task_struct *me = current; + + childregs = ((struct pt_regs *) + (THREAD_SIZE + task_stack_page(p))) - 1; + *childregs = *regs; + + childregs->ax = 0; + if (user_mode(regs)) + childregs->sp = sp; + else + childregs->sp = (unsigned long)childregs; + + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); + + set_tsk_thread_flag(p, TIF_FORK); + + p->fpu_counter = 0; + p->thread.io_bitmap_ptr = NULL; + + savesegment(gs, p->thread.gsindex); + p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs; + savesegment(fs, p->thread.fsindex); + p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; + savesegment(es, p->thread.es); + savesegment(ds, p->thread.ds); + + err = -ENOMEM; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { + p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + set_tsk_thread_flag(p, TIF_IO_BITMAP); + } + + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) { +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); + else +#endif + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + if (err) + goto out; + } + p->thread.iopl = current->thread.iopl; + + err = 0; +out: + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + + return err; +} + +static void +start_thread_common(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp, + unsigned int _cs, unsigned int _ss, unsigned int _ds) +{ + loadsegment(fs, 0); + loadsegment(es, _ds); + loadsegment(ds, _ds); + load_gs_index(0); + regs->ip = new_ip; + regs->sp = new_sp; + regs->cs = _cs; + regs->ss = _ss; + regs->flags = X86_EFLAGS_IF; + /* + * Free the old FP and other extended state + */ + free_thread_xstate(current); +} + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER_CS, __USER_DS, 0); +} + +#ifdef CONFIG_IA32_EMULATION +void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER32_CS, __USER32_DS, __USER32_DS); +} +#endif + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * This could still be optimized: + * - fold all the options into a flag word and test it with a single test. + * - could test fs/gs bitsliced + * + * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. 
+ */ +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; + int cpu = smp_processor_id(); +#ifndef CONFIG_X86_NO_TSS + struct tss_struct *tss = &per_cpu(init_tss, cpu); +#endif + fpu_switch_t fpu; +#if CONFIG_XEN_COMPAT > 0x030002 + struct physdev_set_iopl iopl_op; + struct physdev_set_iobitmap iobmp_op; +#else + struct physdev_op _pdo[2], *pdo = _pdo; +#define iopl_op pdo->u.set_iopl +#define iobmp_op pdo->u.set_iobitmap +#endif + multicall_entry_t _mcl[8], *mcl = _mcl; + + fpu = xen_switch_fpu_prepare(prev_p, next_p, cpu, &mcl); + + /* + * Reload sp0. + * This is load_sp0(tss, next) with a multicall. + */ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = __KERNEL_DS; + mcl->args[1] = next->sp0; + mcl++; + + /* + * Load the per-thread Thread-Local Storage descriptor. + * This is load_TLS(next, cpu) with multicalls. + */ +#define C(i) do { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ + mcl->op = __HYPERVISOR_update_descriptor; \ + mcl->args[0] = arbitrary_virt_to_machine( \ + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ + mcl->args[1] = *(u64 *)&next->tls_array[i]; \ + mcl++; \ + } \ +} while (0) + C(0); C(1); C(2); +#undef C + + if (unlikely(prev->iopl != next->iopl)) { + iopl_op.iopl = (next->iopl == 0) ? 1 : (next->iopl >> 12) & 3; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iopl; + mcl->args[1] = (unsigned long)&iopl_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iopl; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + set_xen_guest_handle(iobmp_op.bitmap, + (char *)next->io_bitmap_ptr); + iobmp_op.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; +#if CONFIG_XEN_COMPAT > 0x030002 + mcl->op = __HYPERVISOR_physdev_op; + mcl->args[0] = PHYSDEVOP_set_iobitmap; + mcl->args[1] = (unsigned long)&iobmp_op; +#else + mcl->op = __HYPERVISOR_physdev_op_compat; + pdo->cmd = PHYSDEVOP_set_iobitmap; + mcl->args[0] = (unsigned long)pdo++; +#endif + mcl++; + } + +#if CONFIG_XEN_COMPAT <= 0x030002 + BUG_ON(pdo > _pdo + ARRAY_SIZE(_pdo)); +#endif + BUG_ON(mcl > _mcl + ARRAY_SIZE(_mcl)); + if (unlikely(HYPERVISOR_multicall_check(_mcl, mcl - _mcl, NULL))) + BUG(); + + /* + * Switch DS and ES. + * This won't pick up thread selector changes, but I guess that is ok. + */ + if (unlikely(next->es)) + loadsegment(es, next->es); + + if (unlikely(next->ds)) + loadsegment(ds, next->ds); + + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated, and must be + * done before math_state_restore, so the TS bit is up + * to date. + */ + arch_end_context_switch(next_p); + + /* + * Switch FS and GS. + * + * Segment register != 0 always requires a reload. Also + * reload when it has changed. When prev process used 64bit + * base always reload to avoid an information leak. 
+ */ + if (unlikely(next->fsindex)) + loadsegment(fs, next->fsindex); + + if (next->fs) + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs)); + + if (unlikely(next->gsindex)) + load_gs_index(next->gsindex); + + if (next->gs) + WARN_ON(HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs)); + + switch_fpu_finish(next_p, fpu); + + /* + * Switch the PDA context. + */ + percpu_write(current_task, next_p); + + percpu_write(kernel_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - KERNEL_STACK_OFFSET); + + /* + * Now maybe reload the debug registers + */ + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || + task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) + __switch_to_xtra(prev_p, next_p); + + return prev_p; +} + +void set_personality_64bit(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 64bit mode */ + clear_thread_flag(TIF_IA32); + + /* Ensure the corresponding mm is not marked. */ + if (current->mm) + current->mm->context.ia32_compat = 0; + + /* TBD: overwrites user setup. Should have two bits. + But 64bit processes have always behaved this way, + so it's not too bad. The main problem is just that + 32bit childs are affected again. */ + current->personality &= ~READ_IMPLIES_EXEC; +} + +void set_personality_ia32(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_IA32); + current->personality |= force_personality32; + + /* Mark the associated mm as containing 32-bit tasks. */ + if (current->mm) + current->mm->context.ia32_compat = 1; + + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; +} + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long stack; + u64 fp, ip; + int count = 0; + + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack = (unsigned long)task_stack_page(p); + if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) + return 0; + fp = *(u64 *)(p->thread.sp); + do { + if (fp < (unsigned long)stack || + fp >= (unsigned long)stack+THREAD_SIZE) + return 0; + ip = *(u64 *)(fp+8); + if (!in_sched_functions(ip)) + return ip; + fp = *(u64 *)fp; + } while (count++ < 16); + return 0; +} + +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +{ + int ret = 0; + int doit = task == current; + int cpu; + + switch (code) { + case ARCH_SET_GS: + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, GS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + load_gs_index(GS_TLS_SEL); + } + task->thread.gsindex = GS_TLS_SEL; + task->thread.gs = 0; + } else { + task->thread.gsindex = 0; + task->thread.gs = addr; + if (doit) { + load_gs_index(0); + ret = HYPERVISOR_set_segment_base( + SEGBASE_GS_USER, addr); + } + } + put_cpu(); + break; + case ARCH_SET_FS: + /* Not strictly needed for fs, but do it for symmetry + with gs */ + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. 
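do_arch_prctl() here is what the arch_prctl syscall lands in; the small-base optimization (GDT TLS slot versus SEGBASE_* hypercall) is invisible to the caller. A user-space sketch querying the FS base and moving the GS base:

/* Sketch: exercise ARCH_GET_FS/ARCH_SET_GS from user space (x86-64).
 * The kernel-side handler is the do_arch_prctl() shown in this file.
 */
#include <asm/prctl.h>		/* ARCH_SET_GS, ARCH_GET_FS, ... */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static unsigned long scratch[16];	/* illustrative GS target */

int main(void)
{
	unsigned long fs_base = 0;

	if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fs_base) == 0)
		printf("FS base: %#lx\n", fs_base);

	/* Point GS at a private block; small addresses go via the GDT,
	 * large ones via the SEGBASE_GS_USER hypercall in this tree. */
	if (syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)scratch))
		perror("arch_prctl(ARCH_SET_GS)");

	return 0;
}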
*/
+		if (addr <= 0xffffffff) {
+			set_32bit_tls(task, FS_TLS, addr);
+			if (doit) {
+				load_TLS(&task->thread, cpu);
+				loadsegment(fs, FS_TLS_SEL);
+			}
+			task->thread.fsindex = FS_TLS_SEL;
+			task->thread.fs = 0;
+		} else {
+			task->thread.fsindex = 0;
+			task->thread.fs = addr;
+			if (doit) {
+				/* set the selector to 0 to not confuse
+				   __switch_to */
+				loadsegment(fs, 0);
+				ret = HYPERVISOR_set_segment_base(SEGBASE_FS,
+								  addr);
+			}
+		}
+		put_cpu();
+		break;
+	case ARCH_GET_FS: {
+		unsigned long base;
+		if (task->thread.fsindex == FS_TLS_SEL)
+			base = read_32bit_tls(task, FS_TLS);
+		else if (doit)
+			rdmsrl(MSR_FS_BASE, base);
+		else
+			base = task->thread.fs;
+		ret = put_user(base, (unsigned long __user *)addr);
+		break;
+	}
+	case ARCH_GET_GS: {
+		unsigned long base;
+		unsigned gsindex;
+		if (task->thread.gsindex == GS_TLS_SEL)
+			base = read_32bit_tls(task, GS_TLS);
+		else if (doit) {
+			savesegment(gs, gsindex);
+			if (gsindex)
+				rdmsrl(MSR_KERNEL_GS_BASE, base);
+			else
+				base = task->thread.gs;
+		} else
+			base = task->thread.gs;
+		ret = put_user(base, (unsigned long __user *)addr);
+		break;
+	}
+
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+long sys_arch_prctl(int code, unsigned long addr)
+{
+	return do_arch_prctl(current, code, addr);
+}
+
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 03920a1..e22394c 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -4,9 +4,7 @@
 #include
 #include
 
-#include
-
-#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI)
 
 static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 {
@@ -34,10 +32,21 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 	if (!(word & (1 << 13))) {
 		dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
 			"disabling irq balancing and affinity\n");
+#ifndef CONFIG_XEN
 		noirqdebug_setup("");
 #ifdef CONFIG_PROC_FS
 		no_irq_affinity = 1;
 #endif
+#else
+		{
+			struct xen_platform_op op = {
+				.cmd = XENPF_platform_quirk,
+				.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING
+			};
+
+			WARN_ON(HYPERVISOR_platform_op(&op));
+		}
+#endif
 	}
 
 	/* put back the original value for config space*/
@@ -53,6 +62,8 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH,
 #endif
 
 #if defined(CONFIG_HPET_TIMER)
+#include
+
 unsigned long force_hpet_address;
 
 static enum {
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 36818f8..06ee365 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -87,14 +87,32 @@ relocate_kernel:
 	movl	PTR(PA_PGD)(%ebp), %eax
 	movl	%eax, %cr3
 
+	/* setup idt */
+	lidtl	idt_48 - relocate_kernel(%edi)
+
+	/* setup gdt */
+	leal	gdt - relocate_kernel(%edi), %eax
+	movl	%eax, (gdt_48 - relocate_kernel) + 2(%edi)
+	lgdtl	gdt_48 - relocate_kernel(%edi)
+
+	/* setup data segment registers */
+	mov	$(gdt_ds - gdt), %eax
+	mov	%eax, %ds
+	mov	%eax, %es
+	mov	%eax, %fs
+	mov	%eax, %gs
+	mov	%eax, %ss
+
 	/* setup a new stack at the end of the physical control page */
 	lea	PAGE_SIZE(%edi), %esp
 
-	/* jump to identity mapped page */
+	/* load new code segment and jump to identity mapped page */
+	pushl	$0
+	pushl	$(gdt_cs - gdt)
 	movl	%edi, %eax
 	addl	$(identity_mapped - relocate_kernel), %eax
 	pushl	%eax
-	ret
+	iretl
 
 identity_mapped:
 	/* set return address to 0 if not preserving context */
@@ -273,5 +291,22 @@ swap_pages:
 	popl	%ebp
 	ret
 
+	.align	16
+gdt:
+	.quad	0x0000000000000000	/* NULL descriptor */
+gdt_cs:
+	.quad	0x00cf9a000000ffff	/* kernel 4GB code at 0x00000000 */
+gdt_ds:
+	.quad	0x00cf92000000ffff	/* kernel 4GB data at 0x00000000 */
+gdt_end:
+
+gdt_48:
+	.word	gdt_end - gdt - 1	/* limit */
+	.long	0			/* base - filled in by code above */
+
+idt_48:
+	.word	0			/* limit */
+	.long	0			/* base */
+
 	.globl kexec_control_code_size
 .set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 7a6f3b3..b591fca 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -91,13 +91,30 @@ relocate_kernel:
 	/* Switch to the identity mapped page tables */
 	movq	%r9, %cr3
 
+	/* setup idt */
+	lidtq	idt_80 - relocate_kernel(%r8)
+
+	/* setup gdt */
+	leaq	gdt - relocate_kernel(%r8), %rax
+	movq	%rax, (gdt_80 - relocate_kernel) + 2(%r8)
+	lgdtq	gdt_80 - relocate_kernel(%r8)
+
+	/* setup data segment registers */
+	xorl	%eax, %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %fs
+	movl	%eax, %gs
+	movl	%eax, %ss
+
 	/* setup a new stack at the end of the physical control page */
 	lea	PAGE_SIZE(%r8), %rsp
 
-	/* jump to identity mapped page */
+	/* load new code segment and jump to identity mapped page */
 	addq	$(identity_mapped - relocate_kernel), %r8
+	pushq	$(gdt_cs - gdt)
 	pushq	%r8
-	ret
+	lretq
 
 identity_mapped:
 	/* set return address to 0 if not preserving context */
@@ -264,5 +281,20 @@ swap_pages:
 3:
 	ret
 
+	.align	16
+gdt:
+	.quad	0x0000000000000000	/* NULL descriptor */
+gdt_cs:
+	.quad	0x00af9a000000ffff
+gdt_end:
+
+gdt_80:
+	.word	gdt_end - gdt - 1	/* limit */
+	.quad	0			/* base - filled in by code above */
+
+idt_80:
+	.word	0			/* limit */
+	.quad	0			/* base */
+
 	.globl kexec_control_code_size
.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 2a26819..7cc4a75 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -1,3 +1,7 @@
+#ifdef CONFIG_XEN
+# define e820 machine_e820
+# include
+#endif
 #include
 #include
 
@@ -37,6 +41,10 @@ static void remove_e820_regions(struct resource *avail)
 
 void arch_remove_reservations(struct resource *avail)
 {
+#ifdef CONFIG_XEN
+	if (!is_initial_xendomain())
+		return;
+#endif
 	/* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
 	if (avail->flags & IORESOURCE_MEM) {
 		if (avail->start < BIOS_END)
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index af6db6e..4598f0b 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -30,6 +30,7 @@ EXPORT_SYMBOL(cmos_lock);
 DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
+#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST
 /*
  * In order to set the CMOS clock precisely, set_rtc_mmss has to be
  * called 500 ms after the second nowtime has started, because when
@@ -155,6 +156,7 @@ unsigned long mach_get_cmos_time(void)
 
 	return mktime(year, mon, day, hour, min, sec);
 }
+#endif /* CONFIG_XEN_UNPRIVILEGED_GUEST */
 
 /* Routines for accessing the CMOS RAM/RTC. */
 unsigned char rtc_cmos_read(unsigned char addr)
@@ -202,6 +204,7 @@ unsigned long long native_read_tsc(void)
 
 EXPORT_SYMBOL(native_read_tsc);
 
+#ifndef CONFIG_XEN_UNPRIVILEGED_GUEST
 static struct resource rtc_resources[] = {
 	[0] = {
 		.start	= RTC_PORT(0),
@@ -247,6 +250,11 @@ static __init int add_rtc_cmos(void)
 	if (mrst_identify_cpu())
 		return -ENODEV;
 
+#ifdef CONFIG_XEN
+	if (!is_initial_xendomain())
+		return -ENODEV;
+#endif
+
 	platform_device_register(&rtc_device);
 	dev_info(&rtc_device.dev,
 		 "registered platform RTC device (no PNP device found)\n");
@@ -254,3 +262,4 @@ static __init int add_rtc_cmos(void)
 	return 0;
 }
 device_initcall(add_rtc_cmos);
+#endif /* CONFIG_XEN_UNPRIVILEGED_GUEST */
diff --git a/arch/x86/kernel/setup-xen.c b/arch/x86/kernel/setup-xen.c
new file mode 100644
index 0000000..52bbdfb
--- /dev/null
+++ b/arch/x86/kernel/setup-xen.c
@@ -0,0 +1,1477 @@
+/*
+ * Copyright (C) 1995 Linus Torvalds
+ *
+ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ * Memory region support
+ *	David Parsons , July-August 1999
+ *
+ * Added E820 sanitization routine (removes overlapping memory regions);
+ *	Brian Moyle , February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ *	Patrick Mochel , March 2002
+ *
+ * Provisions for empty E820 memory regions (reported by certain BIOSes).
+ * Alex Achenbach , December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include